counter.py
1.14 KB
# -*- coding: utf-8 -*-
import os
from lxml import etree
from natsort import natsorted
from preparator import ANNO_PATH
def count_words():
    """Print the number of non-'interp' words for every annotated text.

    Lists ``ANNO_PATH``, natural-sorts the entries and, for each file whose
    name ends in ``.mmax``, parses the sibling ``<textname>_words.xml`` file
    from the same directory. Every ``word`` element whose ``ctag`` attribute
    differs from ``'interp'`` is counted, and one ``<textname> <count>`` line
    is printed per text.

    Raises:
        KeyError: if a ``word`` element has no ``ctag`` attribute.

    Errors raised by ``os.listdir`` or ``etree.parse`` (for example when a
    words file is missing) are not caught and propagate to the caller.
    """
    suffix = '.mmax'
    for filename in natsorted(os.listdir(ANNO_PATH)):
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace would also remove
        # any '.mmax' occurring earlier in the file name.
        textname = filename[:-len(suffix)]
        words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
        tree = etree.parse(words_path)
        # NOTE(review): 'interp' presumably marks punctuation tokens - confirm.
        words_count = sum(
            1 for word in tree.xpath("//word")
            if word.attrib['ctag'] != 'interp'
        )
        # A single pre-formatted argument prints the same line under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (textname, words_count))
def count_mentions():
    """Print the number of mention markables for every annotated text.

    Lists ``ANNO_PATH``, natural-sorts the entries and, for each file whose
    name ends in ``.mmax``, parses the sibling ``<textname>_mentions.xml``
    file from the same directory. All ``markable`` elements in the
    ``www.eml.org/NameSpaces/mention`` namespace are selected, and one
    ``<textname> <count>`` line is printed per text.

    Errors raised by ``os.listdir`` or ``etree.parse`` (for example when a
    mentions file is missing) are not caught and propagate to the caller.
    """
    suffix = '.mmax'
    mention_ns = {'ns': 'www.eml.org/NameSpaces/mention'}
    for filename in natsorted(os.listdir(ANNO_PATH)):
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace would also remove
        # any '.mmax' occurring earlier in the file name.
        textname = filename[:-len(suffix)]
        mentions_path = os.path.join(
            ANNO_PATH, '%s_mentions.xml' % textname)
        tree = etree.parse(mentions_path)
        mentions = tree.xpath("//ns:markable", namespaces=mention_ns)
        # A single pre-formatted argument prints the same line under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (textname, len(mentions)))