counter.py 1.14 KB
# -*- coding: utf-8 -*-

import os

from lxml import etree
from natsort import natsorted

from preparator import ANNO_PATH


def count_words():
    """Print a word count for every ``.mmax`` text found in ANNO_PATH.

    For each ``<name>.mmax`` entry in ANNO_PATH (visited in natural sort
    order) the companion file ``<name>_words.xml`` is parsed and every
    ``word`` element whose ``ctag`` attribute is not ``'interp'`` is
    counted.  One line ``<name> <count>`` is printed per text.

    NOTE(review): 'interp' presumably tags punctuation tokens -- confirm
    against the annotation scheme.

    Raises:
        KeyError: if a ``word`` element has no ``ctag`` attribute.
        IOError/OSError or an lxml parse error: if the words file is
            missing or malformed.
    """
    suffix = '.mmax'
    for filename in natsorted(os.listdir(ANNO_PATH)):
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace() would also drop
        # any '.mmax' occurring earlier in the file name.
        textname = filename[:-len(suffix)]
        words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
        tree = etree.parse(words_path)
        words_count = sum(
            1 for word in tree.xpath("//word")
            if word.attrib['ctag'] != 'interp'
        )
        # A single pre-formatted string prints identically whether ``print``
        # is the Python 2 statement or the Python 3 function.
        print('%s %s' % (textname, words_count))


def count_mentions():
    """Print a mention count for every ``.mmax`` text found in ANNO_PATH.

    For each ``<name>.mmax`` entry in ANNO_PATH (visited in natural sort
    order) the companion file ``<name>_mentions.xml`` is parsed and all
    ``markable`` elements in the ``www.eml.org/NameSpaces/mention``
    namespace are counted.  One line ``<name> <count>`` is printed per text.

    Raises:
        IOError/OSError or an lxml parse error: if the mentions file is
            missing or malformed.
    """
    suffix = '.mmax'
    mention_ns = {'ns': 'www.eml.org/NameSpaces/mention'}
    for filename in natsorted(os.listdir(ANNO_PATH)):
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace() would also drop
        # any '.mmax' occurring earlier in the file name.
        textname = filename[:-len(suffix)]
        mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname)
        tree = etree.parse(mentions_path)
        mentions = tree.xpath("//ns:markable", namespaces=mention_ns)
        # A single pre-formatted string prints identically whether ``print``
        # is the Python 2 statement or the Python 3 function.
        print('%s %s' % (textname, len(mentions)))