# resolver.py 3.44 KB
# -*- coding: utf-8 -*-

import codecs
import os

import numpy as np

from natsort import natsorted

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import SGD, Adam

# Directory containing this script; every path below is resolved against it.
_SCRIPT_DIR = os.path.dirname(__file__)

# Directory whose entries are listed and evaluated one by one.
IN_PATH = os.path.abspath(
    os.path.join(_SCRIPT_DIR, 'data', 'prepared_text_files'))

# Tab-separated report that receives one metrics row per evaluated file.
OUT_PATH = os.path.abspath(
    os.path.join(_SCRIPT_DIR, 'data', 'metrics.csv'))

# Weights file loaded into the network before predicting.
MODEL = os.path.abspath(
    os.path.join(_SCRIPT_DIR, 'weights_2017_05_10.h5'))

# Number of leading feature columns in every data row; the column right
# after them holds the label.
NUMBER_OF_FEATURES = 1126


def main():
    """Script entry point: evaluate every input file and write the report."""
    resolve_files()


def resolve_files():
    """Evaluate every file in IN_PATH and write one metrics row per file.

    The report at OUT_PATH is (re)created, a header line is written, and
    then each directory entry -- visited in natural sort order -- is passed
    to ``resolve`` together with the open report file.
    """
    extension = '.csv'

    # The context manager guarantees the report is flushed and closed even
    # if evaluating one of the input files raises.
    with codecs.open(OUT_PATH, 'w', 'utf-8') as metrics_file:
        write_labels(metrics_file)

        for filename in natsorted(os.listdir(IN_PATH)):
            print(filename)
            # Strip only a trailing ".csv"; str.replace would also remove
            # the substring from the middle of a name.
            if filename.endswith(extension):
                textname = filename[:-len(extension)]
            else:
                textname = filename
            text_data_path = os.path.join(IN_PATH, filename)
            resolve(textname, text_data_path, metrics_file)


def write_labels(metrics_file):
    """Write the tab-separated column header line to ``metrics_file``.

    ``metrics_file`` only needs a ``write`` method accepting text.
    """
    header_line = 'Text\tAccuracy\tPrecision\tRecall\tF1\tPairs\n'
    metrics_file.write(header_line)


# Lazily built network shared by all calls to ``resolve``.
_CACHED_MODEL = None


def _build_model():
    """Assemble the dense network, compile it and load weights from MODEL."""
    inputs = Input(shape=(NUMBER_OF_FEATURES,))

    hidden = Dense(1000, activation='relu')(inputs)
    hidden = Dropout(0.5)(hidden)
    hidden = BatchNormalization()(hidden)

    hidden = Dense(500, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = BatchNormalization()(hidden)

    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs, output)
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.load_weights(MODEL)
    return model


def _get_model():
    """Return the shared network, building it on first use."""
    global _CACHED_MODEL
    if _CACHED_MODEL is None:
        _CACHED_MODEL = _build_model()
    return _CACHED_MODEL


def resolve(textname, text_data_path, metrics_file):
    """Predict labels for one data file and append its metrics row.

    Args:
        textname: Name written in the first column of the metrics row.
        text_data_path: Path of a tab-separated numeric file whose first
            NUMBER_OF_FEATURES columns are features and whose next column
            is the label.
        metrics_file: Open, writable report file passed on to
            ``calc_metrics``.
    """
    # Close the input as soon as it is parsed.  ndmin=2 keeps a file with a
    # single row two-dimensional so the column slicing below still works.
    with open(text_data_path, 'rt') as raw_data:
        test_data = np.loadtxt(raw_data, delimiter='\t', ndmin=2)

    test_set = test_data[:, 0:NUMBER_OF_FEATURES]
    test_labels = test_data[:, NUMBER_OF_FEATURES]  # column after the features holds the labels

    # The architecture and weights are identical for every file, so the
    # network is built once and reused instead of being recreated per call.
    predictions = _get_model().predict(test_set)

    calc_metrics(textname, test_set, test_labels, predictions, metrics_file)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


def calc_metrics(textname, test_set, test_labels, predictions, metrics_file):
    """Compute binary-classification metrics and write them as one row.

    A prediction counts as positive when its score is >= 0.5.  Labels equal
    to 1 are positives and labels equal to 0 are negatives; any other label
    value is counted in none of the four confusion-matrix cells.

    Any metric whose denominator is zero (no predicted positives, no actual
    positives, precision + recall == 0, or an empty input) is reported as
    0.0 instead of raising ZeroDivisionError.

    Args:
        textname: Value written in the first column of the row.
        test_set: Sequence of samples; only its length is used.
        test_labels: Sequence of gold labels, one per sample.
        predictions: Sequence of scores, one per sample; may be flat or
            nested one value per row (it is flattened before use).
        metrics_file: Object with a ``write`` method accepting text.
    """
    total = len(test_set)

    # Flatten so both flat sequences and (n, 1)-shaped outputs are handled,
    # and look only at the first ``total`` entries.
    scores = np.asarray(predictions, dtype=float).reshape(-1)[:total]
    labels = np.asarray(test_labels).reshape(-1)[:total]

    predicted_positive = scores >= 0.5
    predicted_negative = scores < 0.5
    actual_positive = labels == 1
    actual_negative = labels == 0

    # Convert to plain Python floats so repr() below prints e.g. '0.5'
    # rather than a numpy scalar representation.
    true_positives = float(np.count_nonzero(predicted_positive & actual_positive))
    false_positives = float(np.count_nonzero(predicted_positive & actual_negative))
    true_negatives = float(np.count_nonzero(predicted_negative & actual_negative))
    false_negatives = float(np.count_nonzero(predicted_negative & actual_positive))

    accuracy = _safe_ratio(true_positives + true_negatives, total)
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)

    metrics_file.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (textname,
                                                 repr(accuracy),
                                                 repr(precision),
                                                 repr(recall),
                                                 repr(f1),
                                                 repr(total)))


# Run the evaluation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()