Commit e1126cdba70bd5287871ebbe89e9ae6635bb5a01 (0 parents)

rough draft

Showing 28 changed files with 2105 additions and 0 deletions.
.gitignore
0 → 100644
+++ a/.gitignore
+# Created by .ignore support plugin (hsz.mobi)
+### Java template
+*.class
+target/
+
+# Mobile Tools for Java (J2ME)
+.mtj.tmp/
+
+# Package Files #
+*.jar
+*.war
+*.ear
+
+# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
+hs_err_pid*
+
+.idea
+*.iml
\ No newline at end of file
nicolas-cli/pom.xml
0 → 100644
+++ a/nicolas-cli/pom.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <artifactId>nicolas-container</artifactId>
+        <groupId>pl.waw.ipipan.zil.summ</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>nicolas-cli</artifactId>
+
+</project>
\ No newline at end of file
nicolas-core/pom.xml
0 → 100644
+++ a/nicolas-core/pom.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <artifactId>nicolas-container</artifactId>
+        <groupId>pl.waw.ipipan.zil.summ</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>nicolas</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>pl.waw.ipipan.zil.summ</groupId>
+            <artifactId>nicolas-model</artifactId>
+            <version>${project.version}</version>
+            <scope>runtime</scope>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
+package pl.waw.ipipan.zil.summ.nicolas;
+
+import weka.classifiers.Classifier;
+import weka.classifiers.trees.RandomForest;
+
+
+public class Constants {
+
+    public static final String MENTIONS_MODEL_PATH = "mentions_model.bin";
+    public static final String SENTENCES_MODEL_PATH = "sentences_model.bin";
+    public static final String MENTIONS_DATASET_PATH = "mentions_train.arff";
+    public static final String SENTENCES_DATASET_PATH = "sentences_train.arff";
+
+    private Constants() {
+    }
+
+    // classifier used for mention selection; currently configured identically
+    // to the sentence classifier below
+    public static Classifier getClassifier() {
+        RandomForest classifier = new RandomForest();
+        classifier.setNumIterations(250);
+        classifier.setSeed(0);
+        classifier.setNumExecutionSlots(8);
+        return classifier;
+    }
+
+    public static Classifier getSentencesClassifier() {
+        RandomForest classifier = new RandomForest();
+        classifier.setNumIterations(250);
+        classifier.setSeed(0);
+        classifier.setNumExecutionSlots(8);
+        return classifier;
+    }
+}
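For orientation, a minimal sketch (not part of this commit) of how such a Weka RandomForest is trained and queried. The relation and attribute names below are illustrative assumptions; the class attribute sits at index 0, as the rest of this commit assumes:

    import pl.waw.ipipan.zil.summ.nicolas.Constants;
    import weka.classifiers.Classifier;
    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instances;

    import java.util.ArrayList;
    import java.util.Arrays;

    public class ClassifierSketch {
        public static void main(String[] args) throws Exception {
            // class attribute first (index 0), as Utils.createNewInstances assumes
            ArrayList<Attribute> attrs = new ArrayList<>();
            attrs.add(new Attribute("score", Arrays.asList("bad", "good")));
            attrs.add(new Attribute("some_feature")); // illustrative numeric feature
            Instances data = new Instances("Dataset", attrs, 0);
            data.setClassIndex(0);

            // two labelled training rows
            DenseInstance bad = new DenseInstance(2);
            bad.setValue(attrs.get(0), "bad");
            bad.setValue(attrs.get(1), 0.1);
            data.add(bad);
            DenseInstance good = new DenseInstance(2);
            good.setValue(attrs.get(0), "good");
            good.setValue(attrs.get(1), 0.9);
            data.add(good);

            Classifier classifier = Constants.getClassifier();
            classifier.buildClassifier(data);

            // classifyInstance returns the index of the predicted nominal value
            double predicted = classifier.classifyInstance(data.firstInstance());
            System.out.println(data.classAttribute().value((int) predicted));
        }
    }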
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
+package pl.waw.ipipan.zil.summ.nicolas;
+
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+
+public class Nicolas {
+
+    // placeholder in this rough draft: the real summarizer is not wired up yet
+    public String summarizeThrift(TText text, int targetTokenCount) {
+        return "test nicolas";
+    }
+
+}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java
+package pl.waw.ipipan.zil.summ.nicolas;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import com.google.common.io.Files;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;
+import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
+import weka.classifiers.Classifier;
+import weka.core.Attribute;
+import weka.core.DenseInstance;
+import weka.core.Instance;
+import weka.core.Instances;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import static java.util.stream.Collectors.toList;
+
+public class Utils {
+
+    private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
+
+    private static final String DATASET_NAME = "Dataset";
+
+    public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
+        List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+        Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
+
+        LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");
+        Map<TMention, Instance> mention2instance = Maps.newHashMap();
+        for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
+            Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
+            Map<Attribute, Double> mentionFeatures = mention2features.get(tMention);
+            for (Attribute attribute : featureExtractor.getAttributesList()) {
+                instance.setValue(attribute, mentionFeatures.get(attribute));
+            }
+            mention2instance.put(tMention, instance);
+        }
+        return mention2instance;
+    }
+
+    public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) {
+        List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+        Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
+
+        LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");
+        Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
+        for (TSentence sentence : sentences) {
+            Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
+            Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence);
+            for (Attribute attribute : featureExtractor.getAttributesList()) {
+                instance.setValue(attribute, sentenceFeatures.get(attribute));
+            }
+            sentence2instance.put(sentence, instance);
+        }
+        return sentence2instance;
+    }
+
+    public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
+        Instances instances = new Instances(DATASET_NAME, attributesList, 0);
+        instances.setClassIndex(0);
+        return instances;
+    }
+
+    public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException {
+        LOG.info("Loading classifier...");
+        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) {
+            Classifier classifier = (Classifier) ois.readObject();
+            LOG.info("Done. " + classifier.toString());
+            return classifier;
+        }
+    }
+
+    public static Map<String, TText> loadPreprocessedTexts(String path) {
+        Map<String, TText> id2text = Maps.newHashMap();
+        for (File processedFullTextFile : new File(path).listFiles()) {
+            TText processedFullText = loadThrifted(processedFullTextFile);
+            id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText);
+        }
+        LOG.info(id2text.size() + " preprocessed texts found.");
+        return id2text;
+    }
+
+    public static TText loadThrifted(File originalFile) {
+        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) {
+            return (TText) ois.readObject();
+        } catch (ClassNotFoundException | IOException e) {
+            LOG.error("Error reading serialized file: " + e);
+            return null;
+        }
+    }
+
+    public static List<String> tokenize(String text) {
+        return Arrays.asList(text.split("[^\\p{L}0-9]+"));
+    }
+
+    public static List<String> tokenizeOnWhitespace(String text) {
+        return Arrays.asList(text.split(" +"));
+    }
+
+    public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) {
+        Map<TMention, String> mention2orth = Maps.newHashMap();
+        for (TSentence s : sents) {
+            Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth));
+            Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace));
+
+            for (TMention m : s.getMentions()) {
+                StringBuilder mentionOrth = new StringBuilder();
+                for (String tokId : m.getHeadIds()) {
+                    if (!tokId2nps.get(tokId))
+                        mentionOrth.append(" ");
+                    mentionOrth.append(tokId2orth.get(tokId));
+                }
+                mention2orth.put(m, mentionOrth.toString().trim());
+            }
+        }
+        return mention2orth;
+    }
+
+    private static final Collection<String> STOPWORDS = Sets.newHashSet();
+
+    static {
+        STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co"));
+    }
+
+    public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) {
+        Map<TMention, String> mention2orth = Maps.newHashMap();
+        for (TSentence s : sents) {
+            Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
+
+            for (TMention m : s.getMentions()) {
+                StringBuilder mentionOrth = new StringBuilder();
+                for (String tokId : m.getChildIds()) {
+                    TToken token = tokId2tok.get(tokId);
+                    if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {
+                        continue;
+                    }
+
+                    if (!token.isNoPrecedingSpace())
+                        mentionOrth.append(" ");
+                    mentionOrth.append(token.getOrth());
+                }
+                mention2orth.put(m, mentionOrth.toString().trim());
+            }
+        }
+        return mention2orth;
+    }
+
+    public static Map<TMention, String> loadMention2Base(List<TSentence> sents) {
+        Map<TMention, String> mention2base = Maps.newHashMap();
+        for (TSentence s : sents) {
+            Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase()));
+
+            for (TMention m : s.getMentions()) {
+                StringBuilder mentionBase = new StringBuilder();
+                for (String tokId : m.getChildIds()) {
+                    mentionBase.append(" ");
+                    mentionBase.append(tokId2base.get(tokId));
+                }
+                mention2base.put(m, mentionBase.toString().toLowerCase().trim());
+            }
+        }
+        return mention2base;
+    }
+
+    public static String loadSentence2Orth(TSentence sentence) {
+        StringBuilder sb = new StringBuilder();
+        for (TToken token : sentence.getTokens()) {
+            if (!token.isNoPrecedingSpace())
+                sb.append(" ");
+            sb.append(token.getOrth());
+        }
+        return sb.toString().trim();
+    }
+
+    public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {
+        String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);
+
+        MentionScorer scorer = new MentionScorer();
+        Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
+
+        mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);
+        return mention2score.keySet();
+    }
+}
\ No newline at end of file
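A quick illustration, not part of the commit, of the two tokenizers defined above: tokenize splits on any run of characters that are neither letters nor digits, while tokenizeOnWhitespace splits on spaces only, so punctuation stays attached. The input sentence is hypothetical:

    import pl.waw.ipipan.zil.summ.nicolas.Utils;

    public class TokenizeDemo {
        public static void main(String[] args) {
            System.out.println(Utils.tokenize("Ala ma kota, psa i 2 rybki."));
            // -> [Ala, ma, kota, psa, i, 2, rybki]
            System.out.println(Utils.tokenizeOnWhitespace("Ala ma kota, psa i 2 rybki."));
            // -> [Ala, ma, kota,, psa, i, 2, rybki.]
        }
    }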
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
+package pl.waw.ipipan.zil.summ.nicolas.apply;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.summ.nicolas.Constants;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
+import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
+import weka.classifiers.Classifier;
+import weka.core.Instance;
+import weka.core.Instances;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.util.*;
+
+import static java.util.stream.Collectors.toList;
+
+public class ApplyModel2 {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class);
+
+    private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test";
+    private static final String TARGET_DIR = "summaries";
+
+    public static void main(String[] args) throws Exception {
+        Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
+        MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
+
+        Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH);
+        SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
+
+        Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH);
+        int i = 1;
+        double avgSize = 0;
+        for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
+            TText text = entry.getValue();
+
+            Set<TMention> goodMentions
+                    = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text);
+
+            int targetSize = calculateTargetSize(text);
+            String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
+            int size = Utils.tokenize(summary).size();
+            avgSize += size;
+            try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) {
+                bw.append(summary);
+            }
+
+            LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey());
+        }
+
+        LOG.info("Avg size:" + avgSize / id2preprocessedText.size());
+    }
+
+    // summary budget: 20% of the source text's whitespace-token count
+    private static int calculateTargetSize(TText text) {
+        List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+        StringBuilder body = new StringBuilder();
+        for (TSentence sent : sents)
+            body.append(Utils.loadSentence2Orth(sent)).append(" ");
+        int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();
+        return (int) (0.2 * tokenCount);
+    }
+
+    private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
+        List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
+
+        StringBuilder sb = new StringBuilder();
+        for (TSentence sent : selectedSentences) {
+            sb.append(" ").append(Utils.loadSentence2Orth(sent));
+        }
+        return sb.toString().trim();
+    }
+
+    private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
+
+        List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+
+        Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
+        Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
+
+        Map<TSentence, Double> sentence2score = Maps.newHashMap();
+        for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
+            Instance instance = entry.getValue();
+            instance.setDataset(instances);
+            double score = sentenceClassifier.classifyInstance(instance);
+            sentence2score.put(entry.getKey(), score);
+        }
+
+        List<TSentence> sortedSents = Lists.newArrayList(sents);
+        Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed());
+
+        int size = 0;
+        Random r = new Random(1);
+        Set<TSentence> summary = Sets.newHashSet();
+        for (TSentence sent : sortedSents) {
+            size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
+            // once the budget is crossed, drop the boundary sentence with
+            // probability 0.6; otherwise keep it, and stop either way
+            if (r.nextDouble() > 0.4 && size > targetSize)
+                break;
+            summary.add(sent);
+            if (size > targetSize)
+                break;
+        }
+        List<TSentence> selectedSentences = Lists.newArrayList();
+        for (TSentence sent : sents) {
+            if (summary.contains(sent))
+                selectedSentences.add(sent);
+        }
+        return selectedSentences;
+    }
+
+}
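The selection loop above keeps the budget-crossing sentence with probability 0.4; everything else is a plain greedy scan in score order. A deterministic sketch of that scan, under the assumption that the randomized boundary handling is dropped (the class name is illustrative):

    import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
    import pl.waw.ipipan.zil.summ.nicolas.Utils;

    import java.util.ArrayList;
    import java.util.List;

    class GreedySelectionSketch {
        // sentences arrive sorted by classifier score, best first
        static List<TSentence> greedySelect(List<TSentence> sortedByScoreDesc, int targetSize) {
            List<TSentence> selected = new ArrayList<>();
            int size = 0;
            for (TSentence sent : sortedByScoreDesc) {
                selected.add(sent);
                size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
                if (size > targetSize) {
                    break; // budget of 20% of source tokens reached
                }
            }
            return selected;
        }
    }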
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
+package pl.waw.ipipan.zil.summ.nicolas.features;
+
+import com.google.common.collect.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import weka.core.Attribute;
+
+import java.util.*;
+
+public class FeatureExtractor {
+
+    protected static final Logger LOG = LoggerFactory.getLogger(FeatureExtractor.class);
+
+    private final List<Attribute> sortedAttributes = Lists.newArrayList();
+
+    private final BiMap<String, Attribute> name2attribute = HashBiMap.create();
+
+    private final Set<String> normalizedAttributes = Sets.newHashSet();
+
+    public ArrayList<Attribute> getAttributesList() {
+        return Lists.newArrayList(sortedAttributes);
+    }
+
+    protected Attribute getAttributeByName(String name) {
+        return name2attribute.get(name);
+    }
+
+    protected void addNumericAttribute(String attributeName) {
+        name2attribute.put(attributeName, new Attribute(attributeName));
+    }
+
+    protected void addBinaryAttribute(String attributeName) {
+        name2attribute.put(attributeName, new Attribute(attributeName, Lists.newArrayList("f", "t")));
+    }
+
+    protected void addNominalAttribute(String attributeName, List<String> values) {
+        name2attribute.put(attributeName, new Attribute(attributeName, values));
+    }
+
+    protected void addNumericAttributeNormalized(String attributeName) {
+        addNumericAttribute(attributeName);
+        addNumericAttribute(attributeName + "_normalized");
+        normalizedAttributes.add(attributeName);
+    }
+
+    protected void fillSortedAttributes(String scoreAttName) {
+        sortedAttributes.addAll(name2attribute.values());
+        sortedAttributes.remove(getAttributeByName(scoreAttName));
+        Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2)));
+        sortedAttributes.add(0, getAttributeByName(scoreAttName));
+    }
+
+    protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) {
+        Map<Attribute, Double> attribute2max = Maps.newHashMap();
+        Map<Attribute, Double> attribute2min = Maps.newHashMap();
+        for (T entity : entity2attributes.keySet()) {
+            Map<Attribute, Double> entityAttributes = entity2attributes.get(entity);
+            for (String attributeName : normalizedAttributes) {
+                Attribute attribute = getAttributeByName(attributeName);
+                Double value = entityAttributes.get(attribute);
+
+                // start from the infinities: Double.MIN_VALUE is the smallest
+                // positive double and would break the max for negative values
+                attribute2max.putIfAbsent(attribute, Double.NEGATIVE_INFINITY);
+                attribute2max.compute(attribute, (k, v) -> Math.max(v, value));
+
+                attribute2min.putIfAbsent(attribute, Double.POSITIVE_INFINITY);
+                attribute2min.compute(attribute, (k, v) -> Math.min(v, value));
+            }
+        }
+        for (T mention : entity2attributes.keySet()) {
+            Map<Attribute, Double> entityAttributes = entity2attributes.get(mention);
+            for (Attribute attribute : attribute2max.keySet()) {
+                Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized");
+                double range = attribute2max.get(attribute) - attribute2min.get(attribute);
+                entityAttributes.put(normalizedAttribute,
+                        range == 0 ? 0.0 : (entityAttributes.get(attribute) - attribute2min.get(attribute)) / range);
+            }
+        }
+    }
+
+    protected double toBinary(boolean bool) {
+        return bool ? 1.0 : 0.0;
+    }
+}
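The *_normalized attributes use plain min-max scaling, (x - min) / (max - min); for example the values {2, 5, 8} map to {0.0, 0.5, 1.0}. A tiny self-contained check:

    public class MinMaxDemo {
        public static void main(String[] args) {
            double[] values = {2, 5, 8};
            double min = 2, max = 8;
            for (double x : values) {
                // (x - min) / (max - min): 2 -> 0.0, 5 -> 0.5, 8 -> 1.0
                System.out.println((x - min) / (max - min));
            }
        }
    }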
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
+package pl.waw.ipipan.zil.summ.nicolas.features;
+
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import pl.waw.ipipan.zil.multiservice.thrift.types.*;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toMap;
+
+/**
+ * Created by me2 on 04.04.16.
+ */
+public class FeatureHelper {
+
+    private final List<TMention> mentions;
+    private final Map<String, TMention> mentionId2mention;
+    private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap();
+    private final Map<TMention, TCoreference> mention2coref = Maps.newHashMap();
+    private final Map<TMention, TSentence> mention2sent = Maps.newHashMap();
+    private final Map<TMention, TParagraph> mention2par = Maps.newHashMap();
+    private final Map<TMention, String> mention2Orth = Maps.newHashMap();
+    private final Map<TMention, String> mention2Base = Maps.newHashMap();
+    private final Map<TMention, TToken> mention2head = Maps.newHashMap();
+    private final Set<TMention> mentionsInNamedEntities = Sets.newHashSet();
+
+    private final Map<TMention, Integer> mention2Index = Maps.newHashMap();
+    private final Map<TSentence, Integer> sent2Index = Maps.newHashMap();
+    private final Map<TParagraph, Integer> par2Index = Maps.newHashMap();
+    private final Map<TSentence, Integer> sent2IndexInPar = Maps.newHashMap();
+    private final Map<TMention, Integer> mention2indexInPar = Maps.newHashMap();
+    private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap();
+
+
+    public FeatureHelper(TText preprocessedText) {
+        mentions = preprocessedText.getParagraphs().stream()
+                .flatMap(p -> p.getSentences().stream())
+                .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList());
+
+        mentionId2mention = mentions.stream().collect(Collectors.toMap(TMention::getId, Function.identity()));
+
+        for (TCoreference coref : preprocessedText.getCoreferences()) {
+            List<TMention> ments = coref.getMentionIds().stream().map(mentionId2mention::get).collect(toList());
+            for (TMention m : ments) {
+                mention2coref.put(m, coref);
+            }
+            coref2mentions.put(coref, ments);
+        }
+
+        int parIdx = 0;
+        int sentIdx = 0;
+        int mentionIdx = 0;
+        for (TParagraph par : preprocessedText.getParagraphs()) {
+            Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences());
+            mention2Orth.putAll(m2o);
+            Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences());
+            mention2Base.putAll(m2b);
+
+            int sentIdxInPar = 0;
+            int mentionIdxInPar = 0;
+            for (TSentence sent : par.getSentences()) {
+
+                Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity()));
+
+                Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap();
+                for (TNamedEntity namedEntity : sent.getNames()) {
+                    for (String childId : namedEntity.getChildIds()) {
+                        tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet());
+                        tokenId2namedEntities.get(childId).add(namedEntity);
+                    }
+                }
+
+                int mentionIdxInSent = 0;
+                for (TMention mention : sent.getMentions()) {
+                    mention2sent.put(mention, sent);
+                    mention2par.put(mention, par);
+                    mention2Index.put(mention, mentionIdx++);
+                    mention2indexInSent.put(mention, mentionIdxInSent++);
+                    mention2indexInPar.put(mention, mentionIdxInPar++);
+
+                    String firstHeadTokenId = mention.getHeadIds().iterator().next();
+                    mention2head.put(mention, tokenId2token.get(firstHeadTokenId));
+                    if (tokenId2namedEntities.containsKey(firstHeadTokenId))
+                        mentionsInNamedEntities.add(mention);
+                }
+                sent2Index.put(sent, sentIdx++);
+                sent2IndexInPar.put(sent, sentIdxInPar++);
+            }
+
+            par2Index.put(par, parIdx++);
+        }
+    }
+
+    public List<TMention> getMentions() {
+        return mentions;
+    }
+
+    public int getMentionIndexInChain(TMention mention) {
+        return coref2mentions.get(mention2coref.get(mention)).indexOf(mention);
+    }
+
+    public int getChainLength(TMention mention) {
+        return coref2mentions.get(mention2coref.get(mention)).size();
+    }
+
+    public String getSentenceLastTokenOrth(TSentence sent) {
+        return sent.getTokens().get(sent.getTokensSize() - 1).getOrth();
+    }
+
+    public String getMentionOrth(TMention mention) {
+        return mention2Orth.get(mention);
+    }
+
+    public String getMentionBase(TMention mention) {
+        return mention2Base.get(mention);
+    }
+
+    public int getMentionIndex(TMention mention) {
+        return mention2Index.get(mention);
+    }
+
+    public int getMentionIndexInSent(TMention mention) {
+        return mention2indexInSent.get(mention);
+    }
+
+    public int getMentionIndexInPar(TMention mention) {
+        return mention2indexInPar.get(mention);
+    }
+
+    public int getParIndex(TParagraph paragraph) {
+        return par2Index.get(paragraph);
+    }
+
+    public int getSentIndex(TSentence sent) {
+        return sent2Index.get(sent);
+    }
+
+    public int getSentIndexInPar(TSentence sent) {
+        return sent2IndexInPar.get(sent);
+    }
+
+    public TParagraph getMentionParagraph(TMention mention) {
+        return mention2par.get(mention);
+    }
+
+    public TSentence getMentionSentence(TMention mention) {
+        return mention2sent.get(mention);
+    }
+
+    public TMention getFirstChainMention(TMention mention) {
+        return mentionId2mention.get(mention2coref.get(mention).getMentionIdsIterator().next());
+    }
+
+    public TToken getMentionHeadToken(TMention mention) {
+        return mention2head.get(mention);
+    }
+
+    public boolean isMentionNamedEntity(TMention mention) {
+        return mentionsInNamedEntities.contains(mention);
+    }
+
+    public boolean isNested(TMention mention) {
+        // exclude the mention itself, which trivially contains all of its own tokens
+        return mentions.stream().anyMatch(m -> m != mention && m.getChildIds().containsAll(mention.getChildIds()));
+    }
+
+    public boolean isNesting(TMention mention) {
+        return mentions.stream().anyMatch(m -> m != mention && mention.getChildIds().containsAll(m.getChildIds()));
+    }
+
+    public Set<TCoreference> getClusters() {
+        return coref2mentions.keySet();
+    }
+
+    public Set<TMention> getCoreferentMentions(TMention tMention) {
+        return getMentionCluster(tMention).getMentionIds().stream().map(this.mentionId2mention::get).collect(Collectors.toSet());
+    }
+
+    public TCoreference getMentionCluster(TMention tMention) {
+        return this.mention2coref.get(tMention);
+    }
+}
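For intuition on the containsAll-based nesting checks above: a mention whose token ids are a superset of another mention's token ids nests it. The token ids and mention texts below are hypothetical:

    import java.util.Arrays;
    import java.util.List;

    public class NestingDemo {
        public static void main(String[] args) {
            // hypothetical spans, e.g. "prezydent Polski" and "Polski"
            List<String> outer = Arrays.asList("t1", "t2");
            List<String> inner = Arrays.asList("t2");
            System.out.println(outer.containsAll(inner)); // true: outer nests inner
            System.out.println(inner.containsAll(outer)); // false: inner is nested, not nesting
        }
    }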
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
+package pl.waw.ipipan.zil.summ.nicolas.features;
+
+import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation;
+
+
+public class Interpretation {
+    private String ctag = "null";
+    private String casee = "null"; // "case" is a reserved word in Java
+    private String gender = "null";
+    private String number = "null";
+    private String person = "null";
+
+    public Interpretation(TInterpretation chosenInterpretation) {
+        ctag = chosenInterpretation.getCtag();
+        // the msd string is positional; which positions mean what depends on the ctag
+        String[] split = chosenInterpretation.getMsd().split(":");
+        switch (ctag) {
+            case "ger":
+            case "subst":
+            case "pact":
+            case "ppas":
+            case "num":
+            case "numcol":
+            case "adj":
+                number = split[0];
+                casee = split[1];
+                gender = split[2];
+                break;
+            case "ppron12":
+            case "ppron3":
+                number = split[0];
+                casee = split[1];
+                gender = split[2];
+                person = split[3];
+                break;
+            case "siebie":
+                casee = split[0];
+                break;
+            case "fin":
+            case "bedzie":
+            case "aglt":
+            case "impt":
+                number = split[0];
+                person = split[1];
+                break;
+            case "praet":
+            case "winien":
+                number = split[0];
+                gender = split[1];
+                break;
+            case "prep":
+                casee = split[0];
+                break;
+            default:
+                break;
+        }
+    }
+
+    public String getCase() {
+        return casee;
+    }
+
+    public String getGender() {
+        return gender;
+    }
+
+    public String getNumber() {
+        return number;
+    }
+
+    public String getPerson() {
+        return person;
+    }
+
+    public String getCtag() {
+        return ctag;
+    }
+}
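A small standalone illustration of the positional msd convention handled by the switch above, with an assumed NKJP-style tag: for ctag "subst" the positions are number:case:gender, so "sg:nom:m1" yields number=sg, case=nom, gender=m1. The msd value is illustrative, not taken from the commit:

    public class MsdDemo {
        public static void main(String[] args) {
            // for ctag "subst" the positions are number:case:gender
            String msd = "sg:nom:m1";
            String[] split = msd.split(":");
            System.out.println("number=" + split[0] + " case=" + split[1] + " gender=" + split[2]);
        }
    }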
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
+package pl.waw.ipipan.zil.summ.nicolas.mention;
+
+import com.google.common.collect.*;
+import pl.waw.ipipan.zil.multiservice.thrift.types.*;
+import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
+import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
+import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation;
+import weka.core.Attribute;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+
+public class MentionFeatureExtractor extends FeatureExtractor {
+
+    private final List<String> frequentBases = Lists.newArrayList();
+
+    public MentionFeatureExtractor() {
+
+        // load once, before the per-prefix loop below (loading inside the loop
+        // would append the same bases twice)
+        loadFrequentBases();
+
+        //coref
+        addNumericAttributeNormalized("chain_length");
+
+        // text characteristics
+        addNumericAttribute("text_char_count"); // set in calculateFeatures below
+        addNumericAttribute("text_token_count");
+        addNumericAttribute("text_sent_count");
+        addNumericAttribute("text_par_count");
+        addNumericAttribute("text_mention_count");
+        addNumericAttribute("text_cluster_count");
+
+        //mention characteristics
+        for (String prefix : Lists.newArrayList("mention", "chain_first_mention")) {
+            // mention characteristics
+            addNumericAttributeNormalized(prefix + "_index");
+            addNumericAttributeNormalized(prefix + "_index_in_sent");
+            addNumericAttributeNormalized(prefix + "_index_in_par");
+            addNumericAttributeNormalized(prefix + "_index_in_chain");
+            addBinaryAttribute(prefix + "_capitalized");
+            addBinaryAttribute(prefix + "_all_caps");
+            addNumericAttributeNormalized(prefix + "_char_count");
+            addNumericAttributeNormalized(prefix + "_token_count");
+            addBinaryAttribute(prefix + "_is_zero");
+            addBinaryAttribute(prefix + "_is_named");
+            addBinaryAttribute(prefix + "_is_pronoun");
+            addNominalAttribute(prefix + "_ctag", Lists.newArrayList("other", "null", "impt", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"));
+            addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter"));
+            addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc"));
+            addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl"));
+            addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n"));
+
+            // relation to other mentions
+            addBinaryAttribute(prefix + "_is_nested");
+            addBinaryAttribute(prefix + "_is_nesting");
+
+            // par characteristics
+            addNumericAttributeNormalized(prefix + "_par_idx");
+            addNumericAttributeNormalized(prefix + "_par_token_count");
+            addNumericAttributeNormalized(prefix + "_par_sent_count");
+
+            // sent characteristics
+            addNumericAttributeNormalized(prefix + "_sent_token_count");
+            addNumericAttributeNormalized(prefix + "_sent_mention_count");
+            addNumericAttributeNormalized(prefix + "_sent_idx");
+            addNumericAttributeNormalized(prefix + "_sent_idx_in_par");
+            addBinaryAttribute(prefix + "_sent_ends_with_dot");
+            addBinaryAttribute(prefix + "_sent_ends_with_questionmark");
+
+            // frequent bases
+            for (String base : frequentBases) {
+                addBinaryAttribute(prefix + "_" + encodeBase(base));
+            }
+        }
+
+        addNominalAttribute("score", Lists.newArrayList("bad", "good"));
+        fillSortedAttributes("score");
+    }
+
+    private String encodeBase(String base) {
+        return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
+    }
+
+    private void loadFrequentBases() {
+        try (Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath())) {
+            this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList()));
+        } catch (IOException e) {
+            LOG.warn("Could not load frequent bases from frequent_bases.txt", e);
+        }
+    }
+
+    public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) {
+        Map<TMention, Map<Attribute, Double>> result = Maps.newHashMap();
+
+        FeatureHelper helper = new FeatureHelper(preprocessedText);
+
+        addScoreFeature(result, helper.getMentions());
+
+        for (TMention mention : helper.getMentions()) {
+            Map<Attribute, Double> attribute2value = result.get(mention);
+
+            //mention
+            addMentionAttributes(helper, mention, attribute2value, "mention");
+
+            //first chain mention
+            TMention firstChainMention = helper.getFirstChainMention(mention);
+            addMentionAttributes(helper, firstChainMention, attribute2value, "chain_first_mention");
+
+            //coref
+            attribute2value.put(getAttributeByName("chain_length"), (double) helper.getChainLength(mention));
+
+            //text
+            List<TParagraph> pars = preprocessedText.getParagraphs();
+            List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
+            List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList());
+            attribute2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum());
+            attribute2value.put(getAttributeByName("text_token_count"), (double) tokens.size());
+            attribute2value.put(getAttributeByName("text_sent_count"), (double) sents.size());
+            attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size());
+            attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size());
+            attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size());
+
+            assert (attribute2value.size() == getAttributesList().size());
+        }
+        addNormalizedAttributeValues(result);
+
+        return result;
+    }
+
+    private void addMentionAttributes(FeatureHelper helper, TMention mention, Map<Attribute, Double> attribute2value, String attributePrefix) {
+        // mention characteristics
+        attribute2value.put(getAttributeByName(attributePrefix + "_index"), (double) helper.getMentionIndex(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_index_in_par"), (double) helper.getMentionIndexInPar(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_index_in_chain"), (double) helper.getMentionIndexInChain(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize());
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject()));
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*")));
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention)));
+
+        Interpretation interp = new Interpretation(helper.getMentionHeadToken(mention).getChosenInterpretation());
+        addNominalAttributeValue(interp.getCtag(), attribute2value, attributePrefix + "_ctag");
+        addNominalAttributeValue(interp.getPerson(), attribute2value, attributePrefix + "_person");
+        addNominalAttributeValue(interp.getNumber(), attribute2value, attributePrefix + "_number");
+        addNominalAttributeValue(interp.getGender(), attribute2value, attributePrefix + "_gender");
+        addNominalAttributeValue(interp.getCase(), attribute2value, attributePrefix + "_case");
+
+        // relation to other mentions
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention)));
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention)));
+
+        String orth = helper.getMentionOrth(mention);
+        attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1))));
+        attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth)));
+        attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length());
+
+        // par characteristics
+        TParagraph mentionParagraph = helper.getMentionParagraph(mention);
+        attribute2value.put(getAttributeByName(attributePrefix + "_par_idx"), (double) helper.getParIndex(mentionParagraph));
+        attribute2value.put(getAttributeByName(attributePrefix + "_par_token_count"), mentionParagraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum());
+        attribute2value.put(getAttributeByName(attributePrefix + "_par_sent_count"), (double) mentionParagraph.getSentences().size());
+
+        // sent characteristics
+        TSentence mentionSentence = helper.getMentionSentence(mention);
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_token_count"), (double) mentionSentence.getTokensSize());
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size());
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence));
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence));
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals(".")));
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?")));
+
+        // frequent bases
+        String mentionBase = helper.getMentionBase(mention);
+        for (String base : frequentBases) {
+            attribute2value.put(getAttributeByName(attributePrefix + "_" + encodeBase(base)), toBinary(mentionBase.equals(base)));
+        }
+    }
+
+    private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) {
+        Attribute att = getAttributeByName(attributeName);
+        int index = att.indexOfValue(value);
+        if (index == -1)
+            LOG.warn(value + " not found for attribute " + attributeName);
+        attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
+    }
+
+
+    private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) {
+        for (TMention m : mentions) {
+            Map<Attribute, Double> map = Maps.newHashMap();
+            map.put(getAttributeByName("score"), weka.core.Utils.missingValue());
+            result.put(m, map);
+        }
+    }
+
+}
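encodeBase (private above) turns a base form into a safe attribute-name suffix: spaces become underscores and double quotes become Q. A standalone replica with illustrative inputs:

    public class EncodeBaseDemo {
        // same transformation as MentionFeatureExtractor.encodeBase
        static String encodeBase(String base) {
            return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
        }

        public static void main(String[] args) {
            System.out.println(encodeBase("unia europejska"));   // base_equal_unia_europejska
            System.out.println(encodeBase("\"solidarność\""));   // base_equal_QsolidarnośćQ
        }
    }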
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
+package pl.waw.ipipan.zil.summ.nicolas.mention;
+
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+import weka.classifiers.Classifier;
+import weka.core.Instance;
+import weka.core.Instances;
+
+import java.util.Map;
+import java.util.Set;
+
+public class MentionModel {
+
+    private static final Logger LOG = LoggerFactory.getLogger(MentionModel.class);
+
+    public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception {
+        Set<TMention> goodMentions = Sets.newHashSet();
+
+        Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
+        Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor);
+        for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
+            Instance instance = entry.getValue();
+            instance.setDataset(instances);
+            instance.setClassMissing();
+            boolean good = classifier.classifyInstance(instance) > 0.5;
+            if (good)
+                goodMentions.add(entry.getKey());
+        }
+        LOG.info("\t" + goodMentions.size() + "\t" + mention2instance.size());
+        return goodMentions;
+    }
+
+}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
+package pl.waw.ipipan.zil.summ.nicolas.mention;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multiset;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class MentionScorer {
+
+    public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) {
+        Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
+
+        List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
+        Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences);
+
+        return booleanTokenIntersection(mention2Orth, tokenCounts);
+    }
+
+    // a mention scores 1.0 if any of its tokens occurs in the optimal summary
+    private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
+        Map<TMention, Double> mention2score = Maps.newHashMap();
+        for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
+            TMention mention = entry.getKey();
+            String mentionOrth = entry.getValue();
+            for (String token : Utils.tokenize(mentionOrth)) {
+                if (tokenCounts.contains(token.toLowerCase())) {
+                    mention2score.put(mention, 1.0);
+                    break;
+                }
+            }
+            mention2score.putIfAbsent(mention, 0.0);
+        }
+        return mention2score;
+    }
+
+    // alternative scorer, unused in this draft: 1.0 if at least half of the
+    // mention's tokens occur in the optimal summary
+    private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
+        Map<TMention, Double> mention2score = Maps.newHashMap();
+        for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
+            TMention mention = entry.getKey();
+            String mentionOrth = entry.getValue();
+            int present = 0;
+            for (String token : Utils.tokenize(mentionOrth)) {
+                if (tokenCounts.contains(token.toLowerCase())) {
+                    present++;
+                }
+            }
+            mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0);
+        }
+        return mention2score;
+    }
+}
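The scorer marks a mention as good (score 1.0) as soon as any of its tokens occurs in the optimal summary. A small demo, with hypothetical tokens, of the Guava Multiset membership test it relies on:

    import com.google.common.collect.HashMultiset;
    import com.google.common.collect.Multiset;

    import java.util.Arrays;

    public class ScoringDemo {
        public static void main(String[] args) {
            // token counts of a hypothetical optimal summary, lower-cased
            Multiset<String> tokenCounts = HashMultiset.create(Arrays.asList("prezydent", "podpisał", "ustawę"));
            System.out.println(tokenCounts.contains("prezydent")); // true -> mention scored 1.0
            System.out.println(tokenCounts.contains("sejm"));      // false
        }
    }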
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
2 | + | ||
3 | +import com.google.common.base.Charsets; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.io.Files; | ||
6 | +import org.apache.logging.log4j.LogManager; | ||
7 | +import org.apache.logging.log4j.Logger; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
12 | +import weka.core.Instance; | ||
13 | +import weka.core.Instances; | ||
14 | +import weka.core.converters.ArffSaver; | ||
15 | + | ||
16 | +import java.io.File; | ||
17 | +import java.io.IOException; | ||
18 | +import java.util.Map; | ||
19 | + | ||
20 | + | ||
21 | +public class PrepareTrainingData { | ||
22 | + | ||
23 | + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class); | ||
24 | + | ||
25 | + public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | ||
26 | + public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | ||
27 | + | ||
28 | + public static void main(String[] args) throws IOException { | ||
29 | + | ||
30 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | ||
31 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | ||
32 | + | ||
33 | + MentionScorer mentionScorer = new MentionScorer(); | ||
34 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | ||
35 | + | ||
36 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
37 | + | ||
38 | + int i = 1; | ||
39 | + for (String textId : id2preprocessedText.keySet()) { | ||
40 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
41 | + | ||
42 | + TText preprocessedText = id2preprocessedText.get(textId); | ||
43 | + String optimalSummary = id2optimalSummary.get(textId); | ||
44 | + if (optimalSummary == null) | ||
45 | + continue; | ||
46 | + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | ||
47 | + | ||
48 | + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor); | ||
49 | + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | ||
50 | + TMention mention = entry.getKey(); | ||
51 | + Instance instance = entry.getValue(); | ||
52 | + instance.setDataset(instances); | ||
53 | + instance.setClassValue(mention2score.get(mention)); | ||
54 | + instances.add(instance); | ||
55 | + } | ||
56 | + } | ||
57 | + saveInstancesToFile(instances); | ||
58 | + } | ||
59 | + | ||
60 | + private static void saveInstancesToFile(Instances instances) throws IOException { | ||
61 | + ArffSaver saver = new ArffSaver(); | ||
62 | + saver.setInstances(instances); | ||
63 | + saver.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
64 | + saver.writeBatch(); | ||
65 | + } | ||
66 | + | ||
67 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | ||
68 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | ||
69 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | ||
70 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | ||
71 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | ||
72 | + } | ||
73 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | ||
74 | + return id2optimalSummary; | ||
75 | + } | ||
76 | + | ||
77 | + | ||
78 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.core.Instances; | ||
9 | +import weka.core.converters.ArffLoader; | ||
10 | + | ||
11 | +import java.io.File; | ||
12 | +import java.io.FileOutputStream; | ||
13 | +import java.io.ObjectOutputStream; | ||
14 | + | ||
15 | + | ||
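 | +/** | ||
 | + * Trains the mention classifier on the mention ARFF dataset and serializes | ||
 | + * the resulting model to disk. | ||
 | + */ | ||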
16 | +public class TrainModel { | ||
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | ||
18 | + | ||
19 | + public static void main(String[] args) throws Exception { | ||
20 | + | ||
21 | + ArffLoader loader = new ArffLoader(); | ||
22 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
23 | + Instances instances = loader.getDataSet(); | ||
24 | + instances.setClassIndex(0); | ||
25 | + LOG.info(instances.size() + " instances loaded."); | ||
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
27 | + | ||
28 | + StopWatch watch = new StopWatch(); | ||
29 | + watch.start(); | ||
30 | + | ||
31 | + Classifier classifier = Constants.getClassifier(); | ||
32 | + | ||
33 | + LOG.info("Building classifier..."); | ||
34 | + classifier.buildClassifier(instances); | ||
35 | + LOG.info("...done."); | ||
36 | + | ||
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | ||
38 | + new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) { | ||
39 | + oos.writeObject(classifier); | ||
40 | + } | ||
41 | + | ||
42 | + watch.stop(); | ||
43 | + LOG.info("Elapsed time: " + watch); | ||
44 | + | ||
45 | + LOG.info(classifier.toString()); | ||
46 | + } | ||
47 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.classifiers.evaluation.Evaluation; | ||
9 | +import weka.core.Instances; | ||
10 | +import weka.core.converters.ArffLoader; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | +import java.util.Random; | ||
14 | + | ||
15 | + | ||
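 | +/** | ||
 | + * Runs 10-fold cross-validation of the mention classifier on the mention training set. | ||
 | + */ | ||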
16 | +public class Crossvalidate { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
19 | + | ||
20 | + public static void main(String[] args) throws Exception { | ||
21 | + | ||
22 | + ArffLoader loader = new ArffLoader(); | ||
23 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
24 | + Instances instances = loader.getDataSet(); | ||
25 | + instances.setClassIndex(0); | ||
26 | + LOG.info(instances.size() + " instances loaded."); | ||
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
28 | + | ||
32 | + StopWatch watch = new StopWatch(); | ||
33 | + watch.start(); | ||
34 | + | ||
35 | + Classifier tree = Constants.getClassifier(); | ||
36 | + | ||
37 | + Evaluation eval = new Evaluation(instances); | ||
38 | + eval.crossValidateModel(tree, instances, 10, new Random(1)); | ||
39 | + LOG.info(eval.toSummaryString()); | ||
40 | + | ||
41 | + watch.stop(); | ||
42 | + LOG.info("Elapsed time: " + watch); | ||
43 | + } | ||
44 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.classifiers.evaluation.Evaluation; | ||
9 | +import weka.core.Instances; | ||
10 | +import weka.core.converters.ArffLoader; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | +import java.io.FileInputStream; | ||
14 | +import java.io.IOException; | ||
15 | +import java.io.ObjectInputStream; | ||
16 | + | ||
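 | +/** | ||
 | + * Loads the serialized mention model and evaluates it on the full training set | ||
 | + * (a resubstitution sanity check, not a held-out evaluation). | ||
 | + */ | ||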
20 | +public class Validate { | ||
21 | + private static final Logger LOG = LoggerFactory.getLogger(Validate.class); | ||
22 | + | ||
23 | + public static void main(String[] args) throws Exception { | ||
24 | + | ||
25 | + ArffLoader loader = new ArffLoader(); | ||
26 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
27 | + Instances instances = loader.getDataSet(); | ||
28 | + instances.setClassIndex(0); | ||
29 | + LOG.info(instances.size() + " instances loaded."); | ||
30 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
31 | + | ||
32 | + Classifier classifier = loadClassifier(); | ||
33 | + | ||
34 | + StopWatch watch = new StopWatch(); | ||
35 | + watch.start(); | ||
36 | + | ||
37 | + Evaluation eval = new Evaluation(instances); | ||
38 | + eval.evaluateModel(classifier, instances); | ||
39 | + | ||
40 | + LOG.info(eval.toSummaryString()); | ||
41 | + | ||
42 | + watch.stop(); | ||
43 | + LOG.info("Elapsed time: " + watch); | ||
44 | + } | ||
45 | + | ||
46 | + private static Classifier loadClassifier() throws IOException, ClassNotFoundException { | ||
47 | + LOG.info("Loading classifier..."); | ||
48 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) { | ||
49 | + Classifier classifier = (Classifier) ois.readObject(); | ||
50 | + LOG.info("Done. " + classifier.toString()); | ||
51 | + return classifier; | ||
52 | + } | ||
53 | + } | ||
54 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import com.google.common.base.Charsets; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.io.Files; | ||
6 | +import org.apache.logging.log4j.LogManager; | ||
7 | +import org.apache.logging.log4j.Logger; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | ||
15 | +import weka.classifiers.Classifier; | ||
16 | +import weka.core.Instance; | ||
17 | +import weka.core.Instances; | ||
18 | +import weka.core.converters.ArffSaver; | ||
19 | + | ||
20 | +import java.io.File; | ||
21 | +import java.io.IOException; | ||
22 | +import java.util.Map; | ||
23 | +import java.util.Set; | ||
24 | + | ||
25 | + | ||
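 | +/** | ||
 | + * Builds the sentence training set: mentions predicted as summary-worthy by the | ||
 | + * trained mention model feed the sentence features, and each sentence is labeled | ||
 | + * by SentenceScorer against the text's optimal summary. | ||
 | + */ | ||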
26 | +public class PrepareTrainingData { | ||
27 | + | ||
28 | + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class); | ||
29 | + | ||
30 | + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | ||
31 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | ||
32 | + | ||
33 | + public static void main(String[] args) throws Exception { | ||
34 | + | ||
35 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | ||
36 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | ||
37 | + | ||
38 | + SentenceScorer sentenceScorer = new SentenceScorer(); | ||
39 | + SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); | ||
40 | + | ||
41 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
42 | + | ||
43 | + Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | ||
44 | + MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); | ||
45 | + | ||
46 | + int i = 1; | ||
47 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
48 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
49 | + | ||
50 | + TText preprocessedText = entry.getValue(); | ||
51 | + String optimalSummary = id2optimalSummary.get(entry.getKey()); | ||
52 | + if (optimalSummary == null) | ||
53 | + continue; | ||
54 | + Map<TSentence, Double> sentence2score = sentenceScorer.calculateSentenceScores(optimalSummary, preprocessedText); | ||
55 | + | ||
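 | + // good mentions are predicted by the trained mention model; the commented-out | ||
 | + // variant below uses gold-standard mentions instead | ||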
56 | + Set<TMention> goodMentions | ||
57 | + = MentionModel.detectGoodMentions(classifier, mentionFeatureExtractor, preprocessedText); | ||
58 | +// Set<TMention> goodMentions | ||
59 | +// = Utils.loadGoldGoodMentions(textId, preprocessedText, true); | ||
60 | + | ||
61 | + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | ||
62 | + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | ||
63 | + TSentence sentence = entry.getKey(); | ||
64 | + Instance instance = entry.getValue(); | ||
65 | + instance.setDataset(instances); | ||
66 | + instance.setClassValue(sentence2score.get(sentence)); | ||
67 | + instances.add(instance); | ||
68 | + } | ||
69 | + } | ||
70 | + saveInstancesToFile(instances); | ||
71 | + } | ||
72 | + | ||
73 | + private static void saveInstancesToFile(Instances instances) throws IOException { | ||
74 | + ArffSaver saver = new ArffSaver(); | ||
75 | + saver.setInstances(instances); | ||
76 | + saver.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
77 | + saver.writeBatch(); | ||
78 | + } | ||
79 | + | ||
80 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | ||
81 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | ||
82 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | ||
83 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | ||
84 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | ||
85 | + } | ||
86 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | ||
87 | + return id2optimalSummary; | ||
88 | + } | ||
89 | + | ||
90 | + | ||
91 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import com.google.common.collect.Maps; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
7 | +import weka.core.Attribute; | ||
8 | + | ||
9 | +import java.util.List; | ||
10 | +import java.util.Map; | ||
11 | +import java.util.Set; | ||
12 | +import java.util.stream.Collectors; | ||
13 | + | ||
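 | +/** | ||
 | + * Extracts per-sentence features for the sentence ranking model: mention and cluster | ||
 | + * coverage (overall and restricted to good mentions), sentence position and length, | ||
 | + * and paragraph- and text-level statistics. | ||
 | + */ | ||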
14 | +public class SentenceFeatureExtractor extends FeatureExtractor { | ||
15 | + | ||
16 | + public SentenceFeatureExtractor() { | ||
17 | + | ||
18 | + addNumericAttributeNormalized("sent_mention_cluster_count"); | ||
19 | + addNumericAttributeNormalized("sent_good_mention_cluster_count"); | ||
20 | + addNumericAttributeNormalized("sent_good_mention_cluster_good_count"); | ||
21 | + addNumericAttributeNormalized("sent_cluster_count"); | ||
22 | + addNumericAttributeNormalized("sent_good_cluster_count"); | ||
23 | + addNumericAttributeNormalized("sent_mention_count"); | ||
24 | + addNumericAttributeNormalized("sent_good_mention_count"); | ||
25 | + | ||
26 | + addNumericAttributeNormalized("sent_token_length"); | ||
27 | + addNumericAttributeNormalized("sent_idx"); | ||
28 | + addNumericAttributeNormalized("sent_idx_in_par"); | ||
29 | + addBinaryAttribute("sent_ends_with_dot"); | ||
30 | + addBinaryAttribute("sent_ends_with_questionmark"); | ||
31 | + | ||
32 | + addNumericAttributeNormalized("par_idx"); | ||
33 | + addNumericAttributeNormalized("par_token_count"); | ||
34 | + addNumericAttributeNormalized("par_sent_count"); | ||
35 | + | ||
36 | + addNumericAttribute("text_token_count"); | ||
37 | + addNumericAttribute("text_sent_count"); | ||
38 | + addNumericAttribute("text_par_count"); | ||
39 | + addNumericAttribute("text_mention_count"); | ||
40 | + addNumericAttribute("text_cluster_count"); | ||
41 | + | ||
42 | + addNumericAttribute("score"); | ||
43 | + fillSortedAttributes("score"); | ||
44 | + } | ||
45 | + | ||
46 | + public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) { | ||
47 | + | ||
48 | + int sentenceIdx = 0; | ||
49 | + int parIdx = 0; | ||
50 | + | ||
51 | + FeatureHelper helper = new FeatureHelper(preprocessedText); | ||
52 | + List<TParagraph> pars = preprocessedText.getParagraphs(); | ||
53 | + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | ||
54 | + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList()); | ||
55 | + | ||
56 | + Map<TSentence, Map<Attribute, Double>> sentence2features = Maps.newLinkedHashMap(); | ||
57 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | ||
58 | + int sentenceIdxInPar = 0; | ||
59 | + for (TSentence sentence : paragraph.getSentences()) { | ||
60 | + Map<Attribute, Double> feature2value = Maps.newHashMap(); | ||
61 | + | ||
62 | + feature2value.put(getAttributeByName("sent_mention_cluster_count"), sentence.getMentions().stream().mapToDouble(helper::getChainLength).sum()); | ||
63 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_count"), sentence.getMentions().stream().filter(goodMentions::contains).mapToDouble(helper::getChainLength).sum()); | ||
64 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_good_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).flatMap(m -> helper.getCoreferentMentions(m).stream()).filter(goodMentions::contains).count()); | ||
65 | + feature2value.put(getAttributeByName("sent_cluster_count"), (double) sentence.getMentions().stream().map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | ||
66 | + feature2value.put(getAttributeByName("sent_good_cluster_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | ||
67 | + feature2value.put(getAttributeByName("sent_mention_count"), (double) sentence.getMentions().size()); | ||
68 | + feature2value.put(getAttributeByName("sent_good_mention_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).count()); | ||
69 | + | ||
70 | + feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size()); | ||
71 | + feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar); | ||
72 | + feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx); | ||
73 | + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("."))); | ||
74 | + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?"))); | ||
75 | + | ||
76 | + feature2value.put(getAttributeByName("par_idx"), (double) parIdx); | ||
77 | + feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); | ||
78 | + feature2value.put(getAttributeByName("par_sent_count"), (double) paragraph.getSentences().size()); | ||
79 | + | ||
80 | + feature2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum()); | ||
81 | + feature2value.put(getAttributeByName("text_token_count"), (double) tokens.size()); | ||
82 | + feature2value.put(getAttributeByName("text_sent_count"), (double) sents.size()); | ||
83 | + feature2value.put(getAttributeByName("text_par_count"), (double) pars.size()); | ||
84 | + feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); | ||
85 | + feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); | ||
86 | + | ||
87 | + feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | ||
88 | + | ||
90 | + assert (feature2value.size() == getAttributesList().size()); | ||
91 | + | ||
92 | + sentence2features.put(sentence, feature2value); | ||
93 | + | ||
94 | + sentenceIdx++; | ||
95 | + sentenceIdxInPar++; | ||
96 | + } | ||
97 | + parIdx++; | ||
98 | + } | ||
99 | + addNormalizedAttributeValues(sentence2features); | ||
100 | + | ||
101 | + return sentence2features; | ||
102 | + } | ||
103 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import com.google.common.collect.HashMultiset; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.collect.Multiset; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
10 | + | ||
11 | +import java.util.List; | ||
12 | +import java.util.Map; | ||
13 | + | ||
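 | +/** | ||
 | + * Scores each source sentence as the fraction of its tokens that also occur | ||
 | + * in the gold (optimal) summary. | ||
 | + */ | ||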
14 | +public class SentenceScorer { | ||
15 | + public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { | ||
16 | + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | ||
17 | + | ||
18 | + Map<TSentence, Double> sentence2score = Maps.newHashMap(); | ||
19 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | ||
20 | + for (TSentence sentence : paragraph.getSentences()) { | ||
21 | + double score = 0.0; | ||
22 | + | ||
23 | + String orth = Utils.loadSentence2Orth(sentence); | ||
24 | + List<String> tokens = Utils.tokenize(orth); | ||
25 | + for (String token : tokens) { | ||
26 | + score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; | ||
27 | + } | ||
28 | + sentence2score.put(sentence, score / tokens.size()); | ||
29 | + } | ||
 | + } | ||
30 | + return sentence2score; | ||
31 | + } | ||
32 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.core.Instances; | ||
9 | +import weka.core.converters.ArffLoader; | ||
10 | + | ||
11 | +import java.io.File; | ||
12 | +import java.io.FileOutputStream; | ||
13 | +import java.io.ObjectOutputStream; | ||
14 | + | ||
15 | + | ||
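 | +/** | ||
 | + * Trains the sentence classifier on the sentence ARFF dataset and serializes | ||
 | + * the resulting model to disk. | ||
 | + */ | ||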
16 | +public class TrainModel { | ||
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | ||
18 | + | ||
19 | + public static void main(String[] args) throws Exception { | ||
20 | + | ||
21 | + ArffLoader loader = new ArffLoader(); | ||
22 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
23 | + Instances instances = loader.getDataSet(); | ||
24 | + instances.setClassIndex(0); | ||
25 | + LOG.info(instances.size() + " instances loaded."); | ||
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
27 | + | ||
28 | + StopWatch watch = new StopWatch(); | ||
29 | + watch.start(); | ||
30 | + | ||
31 | + Classifier classifier = Constants.getSentencesClassifier(); | ||
32 | + | ||
33 | + LOG.info("Building classifier..."); | ||
34 | + classifier.buildClassifier(instances); | ||
35 | + LOG.info("...done."); | ||
36 | + | ||
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | ||
38 | + new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) { | ||
39 | + oos.writeObject(classifier); | ||
40 | + } | ||
41 | + | ||
42 | + watch.stop(); | ||
43 | + LOG.info("Elapsed time: " + watch); | ||
44 | + | ||
45 | + LOG.info(classifier.toString()); | ||
46 | + } | ||
47 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.classifiers.evaluation.Evaluation; | ||
9 | +import weka.core.Instances; | ||
10 | +import weka.core.converters.ArffLoader; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | +import java.util.Random; | ||
14 | + | ||
15 | + | ||
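 | +/** | ||
 | + * Runs 10-fold cross-validation of the sentence classifier on the sentence training set. | ||
 | + */ | ||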
16 | +public class Crossvalidate { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
19 | + | ||
20 | + public static void main(String[] args) throws Exception { | ||
21 | + | ||
22 | + ArffLoader loader = new ArffLoader(); | ||
23 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
24 | + Instances instances = loader.getDataSet(); | ||
25 | + instances.setClassIndex(0); | ||
26 | + LOG.info(instances.size() + " instances loaded."); | ||
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
28 | + | ||
29 | + StopWatch watch = new StopWatch(); | ||
30 | + watch.start(); | ||
31 | + | ||
32 | + Classifier tree = Constants.getSentencesClassifier(); | ||
33 | + | ||
34 | + Evaluation eval = new Evaluation(instances); | ||
35 | + eval.crossValidateModel(tree, instances, 10, new Random(1)); | ||
36 | + LOG.info(eval.toSummaryString()); | ||
37 | + | ||
38 | + watch.stop(); | ||
39 | + LOG.info("Elapsed time: " + watch); | ||
40 | + } | ||
41 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import com.google.common.collect.Lists; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.collect.Sets; | ||
6 | +import org.apache.commons.csv.CSVFormat; | ||
7 | +import org.apache.commons.csv.CSVPrinter; | ||
8 | +import org.apache.commons.csv.QuoteMode; | ||
9 | +import org.apache.commons.io.IOUtils; | ||
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
12 | + | ||
13 | +import java.io.File; | ||
14 | +import java.io.FileReader; | ||
15 | +import java.io.FileWriter; | ||
16 | +import java.io.IOException; | ||
17 | +import java.util.Arrays; | ||
18 | +import java.util.List; | ||
19 | +import java.util.Map; | ||
20 | +import java.util.Set; | ||
21 | + | ||
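 | +/** | ||
 | + * Collects candidate zero-anaphora contexts: nominative mentions in summary sentences | ||
 | + * whose coreference cluster also surfaced as a nominative mention in the preceding | ||
 | + * summary sentence. Matches are printed to the console and dumped to zeros.tsv. | ||
 | + */ | ||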
25 | +public class Zero { | ||
26 | + | ||
27 | + private static final String IDS_PATH = "summaries_dev"; | ||
28 | + private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/"; | ||
29 | + | ||
30 | + public static void main(String[] args) throws IOException { | ||
31 | + | ||
32 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | ||
33 | + Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH); | ||
34 | + | ||
35 | + int mentionCount = 0; | ||
36 | + int mentionInNom = 0; | ||
37 | + int mentionInNomSequential = 0; | ||
38 | + | ||
39 | + List<List<Object>> rows = Lists.newArrayList(); | ||
40 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
41 | + String textId = entry.getKey(); | ||
43 | + | ||
44 | + TText text = entry.getValue(); | ||
45 | + List<String> sentenceIds = id2sentIds.get(textId); | ||
 | + if (sentenceIds == null) | ||
 | + continue; | ||
47 | + | ||
48 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | ||
49 | + for (TCoreference coreference : text.getCoreferences()) { | ||
50 | + for (String mentionId : coreference.getMentionIds()) { | ||
51 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | ||
52 | + } | ||
53 | + } | ||
54 | + | ||
55 | + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | ||
56 | + TSentence prevSentence = null; | ||
57 | + for (TParagraph p : text.getParagraphs()) { | ||
58 | + Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences()); | ||
59 | + | ||
60 | + for (TSentence sentence : p.getSentences()) { | ||
61 | + if (!sentenceIds.contains(sentence.getId())) | ||
62 | + continue; | ||
63 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | ||
64 | + | ||
65 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); | ||
66 | + for (TToken t : sentence.getTokens()) | ||
67 | + tokenId2Token.put(t.getId(), t); | ||
68 | + | ||
69 | + for (TMention mention : sentence.getMentions()) { | ||
70 | + mentionCount++; | ||
71 | + | ||
72 | + for (String tokenId : mention.getHeadIds()) { | ||
73 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | ||
74 | + if (isInNominative(interp)) { | ||
75 | + mentionInNom++; | ||
76 | + | ||
77 | + currentSentenceNominativeMentionIds.add(mention.getId()); | ||
78 | + if (mentionId2Cluster.getOrDefault(mention.getId(), Sets.newHashSet()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | ||
79 | + mentionInNomSequential++; | ||
80 | + System.out.println(tMentionStringMap.get(mention) | ||
81 | + + "\n\t" + Utils.loadSentence2Orth(prevSentence) | ||
82 | + + "\n\t" + Utils.loadSentence2Orth(sentence)); | ||
83 | + | ||
84 | + List<Object> row = Lists.newArrayList(); | ||
85 | + row.add("C"); | ||
86 | + row.add(textId); | ||
87 | + row.add(tMentionStringMap.get(mention)); | ||
88 | + row.add(Utils.loadSentence2Orth(prevSentence)); | ||
89 | + row.add(Utils.loadSentence2Orth(sentence)); | ||
90 | + rows.add(row); | ||
91 | + } | ||
92 | + break; | ||
93 | + } | ||
94 | + } | ||
95 | + } | ||
96 | + | ||
97 | + prevSentence = sentence; | ||
98 | + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | ||
99 | + } | ||
100 | + } | ||
101 | + } | ||
102 | + | ||
103 | + System.out.println(mentionCount + " mentions"); | ||
104 | + System.out.println(mentionInNom + " mention in nom"); | ||
105 | + System.out.println(mentionInNomSequential + " mention in nom with previous in nom"); | ||
106 | + | ||
107 | + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | ||
108 | + for (List<Object> row : rows) { | ||
109 | + csvPrinter.printRecord(row); | ||
110 | + } | ||
111 | + } | ||
112 | + | ||
113 | + } | ||
114 | + | ||
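 | + /** True if the interpretation is a noun ("subst") with nominative case among its msd tags. */ | ||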
115 | + private static boolean isInNominative(TInterpretation interp) { | ||
116 | + return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | ||
117 | + } | ||
118 | + | ||
119 | + private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException { | ||
120 | + Map<String, List<String>> result = Maps.newHashMap(); | ||
121 | + for (File f : new File(idsPath).listFiles()) { | ||
122 | + String id = f.getName().split("_")[0]; | ||
123 | + try (FileReader reader = new FileReader(f)) { | ||
 | + result.put(id, IOUtils.readLines(reader)); | ||
 | + } | ||
125 | + } | ||
126 | + return result; | ||
127 | + } | ||
128 | +} |
nicolas-model/pom.xml
0 → 100644
1 | +++ a/nicolas-model/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + <parent> | ||
7 | + <artifactId>nicolas-container</artifactId> | ||
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | + <version>1.0-SNAPSHOT</version> | ||
10 | + </parent> | ||
11 | + | ||
12 | + <artifactId>nicolas-model</artifactId> | ||
13 | + | ||
14 | +</project> | ||
0 | \ No newline at end of file | 15 | \ No newline at end of file |
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt
0 → 100644
1 | +++ a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt | ||
1 | +on | ||
2 | +to | ||
3 | +co | ||
4 | +rok | ||
5 | +być | ||
6 | +wszystko | ||
7 | +polska | ||
8 | +człowiek | ||
9 | +sobie | ||
10 | +raz | ||
11 | +my | ||
12 | +mieć | ||
13 | +czas | ||
14 | +państwo | ||
15 | +praca | ||
16 | +osoba | ||
17 | +sprawa | ||
18 | +ja | ||
19 | +kraj | ||
20 | +pieniądz | ||
21 | +nikt | ||
22 | +kto | ||
23 | +przykład | ||
24 | +nic | ||
25 | +koniec | ||
26 | +rząd | ||
27 | +prawo | ||
28 | +życie | ||
29 | +miejsce | ||
30 | +móc | ||
31 | +fot | ||
32 | +problem | ||
33 | +władza | ||
34 | +miesiąc | ||
35 | +rzecz | ||
36 | +stan | ||
37 | +świat | ||
38 | +wszyscy | ||
39 | +mówić | ||
40 | +rozmowa | ||
41 | +coś | ||
42 | +sytuacja | ||
43 | +powód | ||
44 | +początek | ||
45 | +wiedzieć | ||
46 | +dzień | ||
47 | +uwaga | ||
48 | +strona | ||
49 | +udział | ||
50 | +in | ||
51 | +musieć | ||
52 | +polityk | ||
53 | +ktoś | ||
54 | +ogół | ||
55 | +polityka | ||
56 | +chcieć | ||
57 | +walka | ||
58 | +zmiana | ||
59 | +decyzja | ||
60 | +ciąg | ||
61 | +m . | ||
62 | +pan | ||
63 | +szansa | ||
64 | +polak | ||
65 | +przypadek | ||
66 | +większość | ||
67 | +pytanie | ||
68 | +wzgląd | ||
69 | +warszawa | ||
70 | +proca | ||
71 | +pomoc | ||
72 | +prezydent | ||
73 | +społeczeństwo | ||
74 | +wynik | ||
75 | +dziecko | ||
76 | +prawda | ||
77 | +związek | ||
78 | +gospodarka | ||
79 | +część | ||
80 | +wojna | ||
81 | +tydzień | ||
82 | +granica | ||
83 | +głos | ||
84 | +przyszłość | ||
85 | +autor | ||
86 | +wybory | ||
87 | +rynek | ||
88 | +cel | ||
89 | +ustawa | ||
90 | +uważać | ||
91 | +ten rok | ||
92 | +droga | ||
93 | +dom | ||
94 | +rys | ||
95 | +myśleć | ||
96 | +firma | ||
97 | +zasada | ||
98 | +fakt | ||
99 | +kolej | ||
100 | +nadzieja | ||
101 | +dolar | ||
102 | +wraz | ||
103 | +miasto | ||
104 | +rozwój | ||
105 | +ten sposób | ||
106 | +europa | ||
107 | +temat | ||
108 | +siła | ||
109 | +rodzina | ||
110 | +minister | ||
111 | +historia | ||
112 | +wpływ | ||
113 | +współpraca | ||
114 | +środek | ||
115 | +informacja | ||
116 | +procent | ||
117 | +wniosek | ||
118 | +unia europejski | ||
119 | +niemcy | ||
120 | +podstawa | ||
121 | +reforma | ||
122 | +partia | ||
123 | +interes | ||
124 | +ten sprawa | ||
125 | +kandydat | ||
126 | +sukces | ||
127 | +sposób | ||
128 | +wątpliwość | ||
129 | +złoty | ||
130 | +sld | ||
131 | +pracownik | ||
132 | +stanowisko | ||
133 | +dyskusja | ||
134 | +telewizja | ||
135 | +pewność | ||
136 | +odpowiedź | ||
137 | +rzeczywistość | ||
138 | +program | ||
139 | +cena | ||
140 | +działanie | ||
141 | +system | ||
142 | +unia | ||
143 | +ręka | ||
144 | +odpowiedzialność | ||
145 | +środowisko | ||
146 | +solidarność | ||
147 | +demokracja | ||
148 | +maić | ||
149 | +ramy | ||
150 | +badanie | ||
151 | +media | ||
152 | +wartość | ||
153 | +wybór | ||
154 | +głowa | ||
155 | +zostać | ||
156 | +usa | ||
157 | +pracować | ||
158 | +porozumienie | ||
159 | +widzieć | ||
160 | +zdanie | ||
161 | +akcja | ||
162 | +wolność | ||
163 | +spotkanie | ||
164 | +przeszłość | ||
165 | +stosunek | ||
166 | +okazja | ||
167 | +prowadzić | ||
168 | +zachód | ||
169 | +kobieta | ||
170 | +obywatel | ||
171 | +sąd | ||
172 | +ubiegły rok | ||
173 | +dziennikarz | ||
174 | +kultura | ||
175 | +grupa | ||
176 | +opinia publiczny | ||
177 | +obrona | ||
178 | +bezpieczeństwo | ||
179 | +opinia | ||
180 | +rzeczpospolita | ||
181 | +dokument | ||
182 | +racja | ||
183 | +szkoła | ||
184 | +góra | ||
185 | +warunek | ||
186 | +organizacja | ||
187 | +oko | ||
188 | +godzina | ||
189 | +tysiąc | ||
190 | +ten czas | ||
191 | +możliwość | ||
192 | +błąd | ||
193 | +ziemia | ||
194 | +parlament | ||
195 | +ten pora | ||
196 | +chwila | ||
197 | +naród | ||
198 | +konflikt | ||
199 | +działalność | ||
200 | +sejm | ||
201 | +powrót | ||
202 | +premier | ||
203 | +działać | ||
204 | +rada | ||
205 | +zdrowie | ||
206 | +wiek | ||
207 | +dodatek | ||
208 | +poziom | ||
209 | +widzenie | ||
210 | +żyć | ||
211 | +powiedzieć | ||
212 | +inwestycja | ||
213 | +rosja | ||
214 | +niemiec | ||
215 | +samochód | ||
216 | +skutek | ||
217 | +punkt | ||
218 | +rola | ||
219 | +mieszkaniec | ||
220 | +wyborca | ||
221 | +koszt | ||
222 | +budżet | ||
223 | +szef | ||
224 | +styczeń | ||
225 | +instytucja | ||
226 | +pełnia | ||
227 | +ulica | ||
228 | +aws | ||
229 | +ochrona | ||
230 | +dostęp | ||
231 | +zagrożenie | ||
232 | +zgoda | ||
233 | +ue | ||
234 | +" rzeczpospolita " | ||
235 | +liczba | ||
236 | +wieś | ||
237 | +połowa | ||
0 | \ No newline at end of file | 238 | \ No newline at end of file |
nicolas-train/pom.xml
0 → 100644
1 | +++ a/nicolas-train/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + <parent> | ||
7 | + <artifactId>nicolas-container</artifactId> | ||
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | + <version>1.0-SNAPSHOT</version> | ||
10 | + </parent> | ||
11 | + | ||
12 | + <artifactId>nicolas-train</artifactId> | ||
13 | + | ||
14 | +</project> | ||
0 | \ No newline at end of file | 15 | \ No newline at end of file |
nicolas-zero/pom.xml
0 → 100644
1 | +++ a/nicolas-zero/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + <parent> | ||
7 | + <artifactId>nicolas-container</artifactId> | ||
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | + <version>1.0-SNAPSHOT</version> | ||
10 | + </parent> | ||
11 | + | ||
12 | + <artifactId>nicolas-zero</artifactId> | ||
13 | + | ||
14 | +</project> | ||
0 | \ No newline at end of file | 15 | \ No newline at end of file |
pom.xml
0 → 100644
1 | +++ a/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + | ||
7 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
8 | + <artifactId>nicolas-container</artifactId> | ||
9 | + <packaging>pom</packaging> | ||
10 | + <version>1.0-SNAPSHOT</version> | ||
11 | + | ||
12 | + <modules> | ||
13 | + <module>nicolas-core</module> | ||
14 | + <module>nicolas-cli</module> | ||
15 | + <module>nicolas-model</module> | ||
16 | + <module>nicolas-train</module> | ||
17 | + <module>nicolas-zero</module> | ||
18 | + </modules> | ||
19 | + | ||
20 | + <properties> | ||
21 | + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
22 | + <java.version.build>1.8</java.version.build> | ||
23 | + </properties> | ||
24 | + | ||
25 | + <prerequisites> | ||
26 | + <maven>3.0.5</maven> | ||
27 | + </prerequisites> | ||
28 | + | ||
29 | + <developers> | ||
30 | + <developer> | ||
31 | + <name>Mateusz Kopeć</name> | ||
32 | + <organization>ICS PAS</organization> | ||
33 | + <email>m.kopec@ipipan.waw.pl</email> | ||
34 | + </developer> | ||
35 | + </developers> | ||
36 | + | ||
37 | + <dependencies> | ||
38 | + <dependency> | ||
39 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
40 | + <artifactId>pscapi</artifactId> | ||
41 | + <version>1.0-SNAPSHOT</version> | ||
42 | + </dependency> | ||
43 | + <dependency> | ||
44 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
45 | + <artifactId>utils</artifactId> | ||
46 | + <version>1.0-SNAPSHOT</version> | ||
47 | + </dependency> | ||
48 | + | ||
49 | + <dependency> | ||
50 | + <groupId>org.apache.commons</groupId> | ||
51 | + <artifactId>commons-csv</artifactId> | ||
52 | + <version>1.3</version> | ||
53 | + </dependency> | ||
54 | + <dependency> | ||
55 | + <groupId>com.google.guava</groupId> | ||
56 | + <artifactId>guava</artifactId> | ||
57 | + <version>19.0</version> | ||
58 | + </dependency> | ||
59 | + <dependency> | ||
60 | + <groupId>nz.ac.waikato.cms.weka</groupId> | ||
61 | + <artifactId>weka-dev</artifactId> | ||
62 | + <version>3.9.0</version> | ||
63 | + </dependency> | ||
64 | + <dependency> | ||
65 | + <groupId>org.apache.commons</groupId> | ||
66 | + <artifactId>commons-lang3</artifactId> | ||
67 | + <version>3.4</version> | ||
68 | + </dependency> | ||
69 | + <dependency> | ||
70 | + <groupId>commons-io</groupId> | ||
71 | + <artifactId>commons-io</artifactId> | ||
72 | + <version>2.5</version> | ||
73 | + </dependency> | ||
74 | + </dependencies> | ||
75 | + | ||
76 | + | ||
77 | + <build> | ||
78 | + <plugins> | ||
79 | + <plugin> | ||
80 | + <groupId>org.apache.maven.plugins</groupId> | ||
81 | + <artifactId>maven-compiler-plugin</artifactId> | ||
82 | + <version>3.1</version> | ||
83 | + <configuration> | ||
84 | + <source>${java.version.build}</source> | ||
85 | + <target>${java.version.build}</target> | ||
86 | + </configuration> | ||
87 | + </plugin> | ||
88 | + </plugins> | ||
89 | + </build> | ||
90 | + | ||
91 | + <distributionManagement> | ||
92 | + <repository> | ||
93 | + <id>deployment</id> | ||
94 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | ||
95 | + </repository> | ||
96 | + <snapshotRepository> | ||
97 | + <id>deployment</id> | ||
98 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | ||
99 | + </snapshotRepository> | ||
100 | + </distributionManagement> | ||
101 | +</project> | ||
0 | \ No newline at end of file | 102 | \ No newline at end of file |