# get_examples.py (33.2 KB)
#
# (repository web-view residue: Edit / Raw / Blame / History)

#-*- coding:utf-8 -*-

import codecs
import operator
import os
import re
from subprocess import call
from tempfile import mkdtemp, mkstemp

from django.core.management.base import BaseCommand
from django.utils.encoding import smart_str
from django.db.models import Q

#import corpus2
from common.morfeusz import analyse

from dictionary.models import Argument, Lemma
from settings import PROJECT_PATH

# Directory (inside the project) where the export files of this command are
# read from and written to.
BASE_PATH = os.path.join(PROJECT_PATH, 'data')
# Previous, narrower selection kept for reference:
#['gotowe', 'sprawdzone', 'tymczasowy']
# Lemma statuses selected for export; get_examples() turns each of them into
# a Q(status__status=...) filter.
STATUSES_LS = [u'zalążkowe', u'gotowe', u'sprawdzone',
               u'(F) w obróbce', u'(F) gotowe', u'(F) sprawdzone',
               u'(S) w obróbce', u'(S) gotowe', u'(S) sprawdzone']

# Tags of noun-like tokens (not referenced in the visible part of this file).
NOUN_TAGS = ['subst', 'ger']

# Unused tag lists kept for reference:
#VERBTAGLIST = ['fin', 'praet', 'bedzie', 'inf', 'imps', 'impt',
#               'winien', 'pred']
#ADJTAGLIST = ['adj', 'pact', 'ppas']
#INTERPTAGLIST = ['interp']
#NUMERALTAGLIST = ['num', 'numcol']

# Opening part of the XCES document written by write_xces_opening().
XCES_HEADER = """<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE cesAna  SYSTEM 'xcesAnaIPI.dtd'><cesAna type="pre_morph" version="WROC-1.0" xmlns:xlink="http://www.w3.org/1999/xlink">
<chunkList xml:base="text.xml">
"""
# Closing part of the XCES document written by write_xces_closing().
XCES_FOOTER = """</chunkList>
</cesAna>
"""

# Tagger configuration name; only used by the commented-out wcrft call.
WCRFT_CONFIG = 'nkjp_s2.ini'

# Column headers (11 columns) of the tab-separated detailed examples export.
LABELS = ('haslo',
          'status hasla',
          'identyfikator schematu',
          'schemat',
          'opinia o schemacie',
          'przyklad',
          'otagowany przyklad',
          'fragmenty przykladu',
          'opinia o przykladzie',
          'zrodlo przykladu',
          'wybor argumentow')


# Argument types in the order in which sort_arguments() emits them (most
# specific first).  Every type must be listed exactly once: sort_arguments()
# walks this list and appends matching arguments for each entry, so a
# repeated entry would duplicate the arguments of that type.
# NOTE(review): 'xp' used to be listed twice (after 'refl' and again at the
# very end); the first position is kept - confirm that this is the intended
# priority.
ARG_TYPES_BY_PRIORITY = ['fixed',
                    # prepositional phrases
                    'preplexnp', 'comprepnp', 'prepnp', 'prepncp', 'prepadjp',
                    # nominal phrases
                    'lexnp', 'np',
                    # nominal-clausal phrases
                    'ncp', 'cp',
                    # adjp
                    'adjp',
                    # infinitives
                    'infp',
                    # refl
                    'refl',
                    # xp
                    'xp',
                    # advp
                    'advp',
                    # nonch
                    'nonch',
                    # lemma - probably not sorted, because it is added at
                    # a different stage than the other arguments
                    'lemma',
                    ]

class Command(BaseCommand):
    """Management command that exports pinned examples from Slowal."""
    help = 'Get pinned examples from Slowal.'

    def handle(self, **options):
        """Run the export; the command options are ignored."""
        get_examples()

def write_examples(q_statuses):
    """Write lemmas, their schemata and pinned examples to a text file.

    The output goes to ``examples_gotowe_plus.txt`` in ``BASE_PATH``.  For
    every non-old lemma matching any of ``q_statuses`` the entry is written,
    followed by each schema whose opinion is not u'zła' (tab-indented) and
    the example sentences attached to that schema (double tab-indented).

    q_statuses -- non-empty list of ``Q`` objects, OR-ed together to select
                  the lemmas.
    """
    output_path = os.path.join(BASE_PATH, 'examples_gotowe_plus.txt')
    # The context manager closes the file on every exit path and, unlike the
    # former try/finally, does not mask a failed open with a NameError.
    with codecs.open(output_path, 'wt', 'utf-8') as examples_file:
        lemmas = (Lemma.objects.filter(old=False)
                  .filter(reduce(operator.or_, q_statuses))
                  .order_by('entry').all())
        for lemma in lemmas:
            # Progress output; the parenthesised form prints the same text
            # under the Python 2 print statement.
            print(lemma)
            examples_file.write(lemma.entry+'\n')
            for frame in lemma.frames.order_by('text_rep').all():
                opinion = lemma.frame_opinions.get(frame=frame).value.value
                if opinion == u'zła':
                    # Schemata with this opinion are left out of the export.
                    continue
                examples_file.write('\t%s\n' % frame.text_rep)
                for example in lemma.nkjp_examples.filter(frame=frame):
                    examples_file.write('\t\t--> %s\n' % example.sentence)
            examples_file.write('\n\n')

def write_xces_opening(outfile):
    """Write the XCES document header (XCES_HEADER) to ``outfile``."""
    outfile.write(XCES_HEADER)

def write_xces_closing(outfile):
    """Write the XCES document footer (XCES_FOOTER) to ``outfile``."""
    outfile.write(XCES_FOOTER)

def write_paragraph(what, outfile):
    """Write ``what`` to ``outfile`` as a single XCES paragraph chunk.

    Nothing is written when ``what`` is empty or consists only of
    whitespace.  The text is XML-escaped (``&``, ``<``, ``>``) so that the
    resulting document stays well-formed for sentences containing markup
    characters.

    what    -- paragraph text (unicode).
    outfile -- object with a ``write`` method accepting unicode.
    """
    from xml.sax.saxutils import escape

    if len(what) > 0 and not what.isspace():
        outfile.write(u'<chunk type="p" id="p1">')
        outfile.write(escape(what))
        outfile.write(u'</chunk>\n')

def sentence_to_xces(sentence):
    """Write ``sentence`` as a one-paragraph XCES document to a temp file.

    A fresh temporary directory is created, a temporary file is created
    inside it and filled with the XCES header, the paragraph and the XCES
    footer.

    Returns the path of the written file.  Errors raised while creating or
    writing the file propagate to the caller (the former ``return`` inside
    ``finally`` silently discarded them).
    """
    tmp_folder = mkdtemp()
    # NOTE(review): changing the process working directory is kept from the
    # original implementation; nothing in this function depends on it.
    os.chdir(tmp_folder)
    tmp_file, tmpfilename = mkstemp(dir=tmp_folder)
    # Only the path is needed; the file is reopened below with an encoding.
    os.close(tmp_file)
    with codecs.open(tmpfilename, 'wt', 'utf-8') as outfile:
        write_xces_opening(outfile)
        write_paragraph(sentence, outfile)
        write_xces_closing(outfile)
    return tmpfilename

def chunks(rdr):
    """Generate the chunks returned by ``rdr.get_next_chunk()``.

    Iteration stops at the first falsy value returned by the reader.
    """
    current = rdr.get_next_chunk()
    while current:
        yield current
        current = rdr.get_next_chunk()

#def tag_sentence(tagged_sentence_path):
#    sentences_count = 0
#    tagged_sentence_chunks = []
#    tagset = corpus2.get_named_tagset('nkjp')
#    rdr = corpus2.TokenReader.create_path_reader('xces', tagset, tagged_sentence_path)
#    for chunk in chunks(rdr):
#        for sent in chunk.sentences():
#            sentences_count += 1
#            for tok in sent.tokens():
#                prefered_lexeme = tok.get_preferred_lexeme(tagset)
#                base_form = prefered_lexeme.lemma_utf8().decode('utf-8')
#                orth_form = tok.orth_utf8().decode('utf-8')
#                tags = tagset.tag_to_string(prefered_lexeme.tag())
#                sentence_chunk = u'%s[%s>%s]' % (orth_form, base_form, tags)
#                tagged_sentence_chunks.append(sentence_chunk)
#    tagged_sentence = ' '.join(tagged_sentence_chunks)
#    if sentences_count > 1:
#        pass
#    return tagged_sentence

#def get_tagged_sentence(sentence):
#    tagged_sentence = 'Error!'
#    try:
#        tmp_folder = mkdtemp()
#        os.chdir(tmp_folder)
#        xces_file, xces_path = mkstemp(dir=tmp_folder)
#        os.close(xces_file)
#        tagged_sentence_file, tagged_sentence_path = mkstemp(dir=tmp_folder)
#        os.close(tagged_sentence_file)
#        xces_file = codecs.open(xces_path, 'wt', 'utf-8')
#        write_xces_opening(xces_file)
#        write_paragraph(sentence, xces_file)
#        write_xces_closing(xces_file)
#        xces_file.close()
#        try:
#            call(['wcrft', WCRFT_CONFIG, xces_path, '-O', tagged_sentence_path, '-C', '-i', 'premorph'])
#            tagged_sentence = tag_sentence(tagged_sentence_path)
#        except:
#            print 'Tagging failed.'
#    finally:
#        xces_file.close()
#        os.remove(xces_path)
#        os.remove(tagged_sentence_path)
#        return tagged_sentence

def write_detailed_examples(q_statuses):
    """Export pinned examples with their schema data as a tab-separated file.

    The output goes to ``detailed_examples_20150616.csv`` in ``BASE_PATH``.
    The first line holds the ``LABELS`` header; every following line
    describes one example of one schema of a non-old lemma matching any of
    ``q_statuses``.  Line breaks and tabs inside the sentence are replaced
    so that each example stays on a single line.

    q_statuses -- non-empty list of ``Q`` objects, OR-ed together to select
                  the lemmas.
    """
    output_path = os.path.join(BASE_PATH, 'detailed_examples_20150616.csv')
    # The context manager closes the file on every exit path and, unlike the
    # former try/finally, does not mask a failed open with a NameError.
    with codecs.open(output_path, 'wt', 'utf-8') as examples_file:
        examples_file.write(u'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % LABELS)
        lemmas = (Lemma.objects.filter(old=False)
                  .filter(reduce(operator.or_, q_statuses))
                  .order_by('entry').all())
        for lemma in lemmas:
            # Progress output; the parenthesised form prints the same text
            # under the Python 2 print statement.
            print(lemma)
            lemma_entry = lemma.entry
            lemma_status = lemma.status.status
            for frame in lemma.frames.order_by('text_rep').all():
                frame_text_rep = frame.text_rep
                frame_opinion = lemma.frame_opinions.filter(frame=frame).all()[0].value
                for example in lemma.nkjp_examples.filter(frame=frame):
                    sentence = example.sentence.replace('\n', ' ').replace('\r', '').replace('\t', ' ')
                    # Tagging is switched off; it can be re-enabled with the
                    # commented-out get_tagged_sentence(sentence) if needed.
                    tagged_sentence = ''
                    example_opinion = example.opinion.opinion
                    example_source = example.source.source
                    arguments_selection = u' + '.join(
                        [u'%s' % selection.__unicode__()
                         for selection in example.arguments.all()])
                    # The doubled tab leaves the 'fragmenty przykladu'
                    # column empty, keeping 11 columns like the header.
                    examples_file.write(
                        u'%s\t%s\t%d\t%s\t%s\t%s\t%s\t\t%s\t%s\t%s\n' % (
                            lemma_entry,
                            lemma_status,
                            frame.id,
                            frame_text_rep,
                            frame_opinion,
                            sentence,
                            tagged_sentence,
                            example_opinion,
                            example_source,
                            arguments_selection))

def get_arguments(arguments_selection):
    """Extract the selected arguments from an arguments-selection string.

    ``arguments_selection`` is split on '+' into positions.  From each
    position (after stripping surrounding whitespace and square brackets)
    the first ``<...>`` group is taken and split on ';' into arguments.
    Arguments from positions starting with 'subj' or 'obj' are prefixed
    with that category and a colon.

    Positions without a ``<...>`` group (for example an empty selection
    string) contribute no arguments instead of raising IndexError.

    Returns the arguments ordered by ``sort_arguments``.
    """
    arguments = []
    for position in arguments_selection.split('+'):
        position = position.strip().lstrip('[').rstrip(']')
        if position.startswith('subj'):
            category = 'subj'
        elif position.startswith('obj'):
            category = 'obj'
        else:
            category = ''
        # Non-greedy, so only the first <...> group of the position is used.
        selection = re.search(r'<.*?>', position)
        if selection is None:
            continue
        for arg in selection.group(0).lstrip('<').rstrip('>').split(';'):
            if category:
                arguments.append(u'%s:%s' % (category, arg))
            else:
                arguments.append(arg)
    return sort_arguments(arguments)

def sort_arguments(arguments):
    """Return ``arguments`` ordered according to ARG_TYPES_BY_PRIORITY.

    For every entry of the priority list, the arguments of that type are
    appended in their original relative order.  Arguments whose type is not
    on the list are left out of the result.
    """
    typed_arguments = [(arg_from_text_rep(arg)[0], arg) for arg in arguments]
    ordered = []
    for wanted_type in ARG_TYPES_BY_PRIORITY:
        ordered.extend(arg for arg_type, arg in typed_arguments
                       if arg_type == wanted_type)
    return ordered

def arg_from_text_rep(argument):
    """Split an argument text representation into its components.

    Returns a ``(arg_type, attributes, category)`` tuple:
    * ``category`` is the text before the first ':' ('' when there is no
      colon); the argument proper is the text between the first and the
      second colon,
    * ``arg_type`` is the text before the first '(',
    * ``attributes`` is the comma-separated content that follows the first
      '(' (up to a second '(' if any), with trailing ')' and all
      apostrophes removed; an empty list when there is no '('.
    """
    category = ''
    if ':' in argument:
        category, _, remainder = argument.partition(':')
        argument = remainder.partition(':')[0]
    arg_type, paren, tail = argument.partition('(')
    if not paren:
        return arg_type, [], category
    raw_attributes = tail.partition('(')[0].rstrip(')')
    return arg_type, raw_attributes.replace("'", "").split(','), category

def tokenize_sentence(sentence):
    """Turn a tagged sentence into a list of token dictionaries.

    ``sentence`` is expected to consist of ``orth[base>tag:tag:...]`` items
    separated by single spaces.  Every token dictionary holds:
    'idx' (position in the returned list), 'orth' (lower-cased), 'base',
    'tags' (list), and the argument bookkeeping fields 'argument',
    'argument_start', 'argument_end' and 'occupied' in their initial state.

    Items whose orth is '[' or '>' would confuse the splitting, so they are
    recognised by their prefix and stored as 'interp' tokens.

    The closing ']' of the final item is removed from its last tag
    (previously it stayed there, e.g. 'interp]'), and empty items such as
    the one produced by a trailing '] ' are skipped.

    Raises ValueError or IndexError for items not matching the format.
    """
    tokens = []

    def make_token(orth, base, tags):
        # 'idx' equals the position the token is about to take in `tokens`.
        return {'idx': len(tokens),
                'orth': orth,
                'base': base,
                'tags': tags,
                'argument': '',
                'argument_start': -1,
                'argument_end': -1,
                'occupied': False}

    # Splitting on '] ' removes the closing bracket of every item except
    # the last one.
    for chunk in sentence.split('] '):
        if not chunk:
            continue
        if chunk.startswith('[[['):
            token = make_token('[', '[', ['interp'])
        elif chunk.startswith('>'):
            token = make_token('>', '>', ['interp'])
        else:
            chunk_parts = chunk.split('[')
            base, tags = chunk_parts[1].split('>')
            token = make_token(chunk_parts[0].lower(), base,
                               tags.rstrip(']').split(':'))
        tokens.append(token)
    return tokens

def case_conversion(case, category):
    """Translate a schema case value into the value used for tag matching.

    'instr' becomes 'inst' and 'part' becomes u'gen|acc'.  'str' becomes
    'nom' for the 'subj' category or an empty category, and 'acc' for the
    'obj' category.  Every other combination is returned unchanged.
    """
    if case == 'str':
        if category == 'obj':
            return 'acc'
        if category == 'subj' or not category:
            return 'nom'
        return case
    if case == 'instr':
        return 'inst'
    if case == 'part':
        return u'gen|acc'
    return case

def number_conversion(number):
    """Return '' for the '_' placeholder, otherwise ``number`` unchanged."""
    return '' if number == '_' else number

def aspect_conversion(aspect):
    """Return '' for the '_' placeholder, otherwise ``aspect`` unchanged."""
    return '' if aspect == '_' else aspect

def phrase_type_conversion(phrase_type):
    """Expand a clause type into a '|'-separated list of alternatives.

    Types without an expansion are returned unchanged.
    """
    expansions = {
        u'że': u'że|iż',
        u'żeby': u'żeby|aby|by|iżby|ażeby',
        # !!! not sure what żeby2 should expand to
        u'żeby2': u'że|iż|żeby',
        # !!! not sure what int should expand to
        u'int': u'kiedy|jak|czy',
        u'jakby': u'jakby|jak gdyby',
    }
    if phrase_type in expansions:
        return expansions[phrase_type]
    return phrase_type

def complex_prep_lemma_conversion(lemma):
    """Map the inflected noun of a complex preposition to its base form.

    Forms without an entry in the table are returned unchanged.
    """
    base_forms = {
        u'powodu': u'powód',
        u'sprawie': u'sprawa',
        u'kwestii': u'kwestia',
        u'roli': u'rola',
        u'okolicach': u'okolica',
        u'czasie': u'czas',
        u'stronie': u'strona',
        u'początku': u'początek',
    }
    if lemma in base_forms:
        return base_forms[lemma]
    return lemma

def proper_case(token, case):
    """Tell whether ``token`` carries exactly one of the allowed cases.

    ``case`` is a single case or several alternatives joined with '|'.
    The result is True when exactly one of them occurs among
    ``token['tags']``.
    """
    allowed = case.split('|') if '|' in case else [case]
    shared = set(token['tags']).intersection(allowed)
    return len(shared) == 1

def get_matching_token(tokens, orth='', base='', case='',
                         number='', phrase_type='', aspect='',
                         degree='', pos=''):
    """Return the first free token fulfilling all given constraints.

    Tokens flagged as 'occupied' are never returned.  Empty constraints are
    ignored.  ``orth`` and ``base`` are compared with the token's fields,
    ``case`` is checked with ``proper_case`` and ``number``, ``aspect``,
    ``degree`` and ``pos`` must occur among the token's tags.
    ``phrase_type`` is accepted but not used.

    Returns None when no token matches.
    """
    required_tags = [tag for tag in (number, aspect, degree, pos) if tag]
    for candidate in tokens:
        if candidate['occupied']:
            continue
        if orth and candidate['orth'] != orth:
            continue
        if base and candidate['base'] != base:
            continue
        if case and not proper_case(candidate, case):
            continue
        candidate_tags = candidate['tags']
        if all(tag in candidate_tags for tag in required_tags):
            return candidate
    return None

def fill_token_data(token, argument, start_idx, end_idx):
    """Store the argument and the bounds of its span in ``token``."""
    token.update(argument=argument,
                 argument_start=start_idx,
                 argument_end=end_idx)

def mark_fixed(tokens, argument, tresc):
    """Mark the first occurrence of the fixed phrase ``tresc`` in ``tokens``.

    ``tresc`` is split on whitespace and its words are compared with the
    'orth' fields of the free (not occupied) tokens; occupied tokens are
    skipped over, so a match may span them.  Every token from the first to
    the last matched position gets the argument data and is flagged as
    occupied.

    Every possible starting token is tried, so a phrase is also found when
    its first word is directly preceded by the same word (e.g. "a b" in
    "a a b").  Nothing is marked when the phrase does not occur or when
    ``tresc`` contains no words.
    """
    tresc_orths = tresc.split()
    if not tresc_orths:
        return
    # Positions of the tokens that may still take part in a match.
    free_positions = [position for position, token in enumerate(tokens)
                      if not token['occupied']]
    phrase_len = len(tresc_orths)
    for offset in range(len(free_positions) - phrase_len + 1):
        window = free_positions[offset:offset + phrase_len]
        if [tokens[position]['orth'] for position in window] != tresc_orths:
            continue
        tresc_start, tresc_end = window[0], window[-1]
        for token in tokens[tresc_start:tresc_end+1]:
            fill_token_data(token, argument, tresc_start, tresc_end)
            token['occupied'] = True
        return

def mark_preplexnp(tokens, argument, preposition, case, number, lemma):
    """Mark a preplexnp argument: a preposition plus a fixed noun lemma.

    The span opens at the first free 'prep' token with base ``preposition``
    and the given case, and closes at the first free 'subst' token with
    base ``lemma``, that case and ``number`` found from there on.  Both
    tokens receive the argument data; the whole span becomes occupied.

    Raises ValueError when either token cannot be found.
    """
    # !! shouldn't the case agree with the lemma instead??
    prep = get_matching_token(tokens, base=preposition, case=case, pos='prep')
    first = tokens.index(prep)
    noun = get_matching_token(tokens[first:], base=lemma, case=case,
                              number=number, pos='subst')
    last = tokens.index(noun)
    for matched in (prep, noun):
        fill_token_data(matched, argument, first, last)
    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_comprepnp(tokens, argument, preposition, lemma):
    """Mark a comprepnp argument: a complex preposition plus a noun.

    The complex preposition consists of two words.  For u'co' + u'do' the
    words are matched by orth (as 'subst' and 'prep' respectively);
    otherwise the first is matched by base as 'prep' and the second by the
    base form of ``lemma`` as 'subst'.  The span is closed by the first
    free 'subst' token after the second word.  The three tokens receive
    the argument data; the whole span becomes occupied.

    Raises ValueError when any of the tokens cannot be found.
    """
    if preposition == u'co' and lemma == u'do':
        # !! shouldn't pos be subst
        first_word = dict(orth='co', pos='subst')
        second_word = dict(orth='do', pos='prep')
    else:
        # !! shouldn't the case agree with the lemma instead??
        first_word = dict(base=preposition, pos='prep')
        second_word = dict(base=complex_prep_lemma_conversion(lemma),
                           pos='subst')
    prep = get_matching_token(tokens, **first_word)
    first = tokens.index(prep)
    prep_noun = get_matching_token(tokens[first:], **second_word)
    prep_end = tokens.index(prep_noun)
    # too simplistic, produces nonsense
    noun = get_matching_token(tokens[prep_end + 1:], pos='subst')
    last = tokens.index(noun)
    for matched in (prep, prep_noun, noun):
        fill_token_data(matched, argument, first, last)
    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_prepnp(tokens, argument, preposition, case):
    """Mark a prepnp argument: a preposition plus a noun in ``case``.

    The span opens at the first free 'prep' token with base ``preposition``
    and the given case, and closes at the first free 'subst' token in that
    case found from there on.  Both tokens receive the argument data; the
    whole span becomes occupied.

    Raises ValueError when either token cannot be found.
    """
    # !! shouldn't the case agree with the lemma instead??
    prep = get_matching_token(tokens, base=preposition, case=case, pos='prep')
    first = tokens.index(prep)
    noun = get_matching_token(tokens[first:], case=case, pos='subst')
    last = tokens.index(noun)
    for matched in (prep, noun):
        fill_token_data(matched, argument, first, last)
    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_phrase(tokens, start_idx, argument, phrase_type):
    """Mark the word(s) introducing a clause and return their span.

    ``phrase_type`` holds '|'-separated alternatives which are tried in
    order; the search only considers free tokens located after
    ``start_idx``.  A one-word alternative is matched by base.  For an
    alternative of several words the first two words are matched by base,
    the second one somewhere after the first.  The matched tokens receive
    the argument data; flagging them as occupied is left to the caller.

    An alternative whose first word is missing is now simply skipped
    (formerly this raised UnboundLocalError and aborted the remaining
    alternatives).

    Returns a ``(phrase_start_idx, phrase_end_idx)`` tuple.
    Raises ValueError when none of the alternatives can be matched.
    """
    for phrase in phrase_type.split('|'):
        phrase_parts = phrase.split()
        if len(phrase_parts) > 1:
            first_token = get_matching_token(tokens[start_idx+1:],
                                             base=phrase_parts[0])
            if first_token is None:
                continue
            phrase_start_idx = tokens.index(first_token)
            second_token = get_matching_token(tokens[phrase_start_idx+1:],
                                              base=phrase_parts[1])
            if second_token is None:
                continue
            phrase_end_idx = tokens.index(second_token)
            fill_token_data(first_token, argument, phrase_start_idx, phrase_end_idx)
            fill_token_data(second_token, argument, phrase_start_idx, phrase_end_idx)
            return phrase_start_idx, phrase_end_idx
        phrase_token = get_matching_token(tokens[start_idx+1:], base=phrase)
        if phrase_token is None:
            continue
        phrase_start_idx = phrase_end_idx = tokens.index(phrase_token)
        fill_token_data(phrase_token, argument, phrase_start_idx, phrase_end_idx)
        return phrase_start_idx, phrase_end_idx
    # The callers in this file treat any exception as "argument not matched".
    raise ValueError('no token matches phrase type %r' % (phrase_type,))

def mark_prepncp(tokens, argument, preposition, case, phrase_type):
    """Mark a prepncp argument: preposition, noun and a clause introducer.

    The span opens at the first free 'prep' token with base ``preposition``
    and the given case, continues with the first free 'subst' token in that
    case found from there on, and is closed by the phrase that
    ``mark_phrase`` finds after the noun.  The preposition and the noun
    receive the argument data; the whole span becomes occupied.

    Raises ValueError when the preposition or the noun cannot be found;
    errors raised by ``mark_phrase`` propagate.
    """
    # !! shouldn't the case agree with the lemma instead??
    prep = get_matching_token(tokens, base=preposition, case=case, pos='prep')
    first = tokens.index(prep)
    noun = get_matching_token(tokens[first:], case=case, pos='subst')
    noun_position = tokens.index(noun)
    last = mark_phrase(tokens, noun_position, argument, phrase_type)[1]
    for matched in (prep, noun):
        fill_token_data(matched, argument, first, last)
    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_prepadjp(tokens, argument, preposition, case):
    """Mark a prepadjp argument: a preposition plus an adjective.

    The span opens at the first free 'prep' token with base ``preposition``
    and the given case, and closes at the first free 'adj' token in that
    case found from there on.  Both tokens receive the argument data; the
    whole span becomes occupied.

    Raises ValueError when either token cannot be found.
    """
    # !! shouldn't the case agree with the lemma instead??
    prep = get_matching_token(tokens, base=preposition, case=case, pos='prep')
    first = tokens.index(prep)
    adjective = get_matching_token(tokens[first:], case=case, pos='adj')
    last = tokens.index(adjective)
    for matched in (prep, adjective):
        fill_token_data(matched, argument, first, last)
    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_lexnp(tokens, argument, case, number, lemma):
    """Mark a lexnp argument: one noun with a fixed lemma.

    The first free 'subst' token with base ``lemma``, the given case and
    ``number`` receives the argument data and becomes occupied.

    Raises ValueError when no such token can be found.
    """
    noun = get_matching_token(tokens, base=lemma, case=case,
                              number=number, pos='subst')
    position = tokens.index(noun)
    fill_token_data(noun, argument, position, position)
    tokens[position]['occupied'] = True

def mark_np(tokens, argument, case):
    """Mark an np argument: one noun in the given case.

    The first free 'subst' token in ``case`` receives the argument data and
    becomes occupied.

    Raises ValueError when no such token can be found.
    """
    noun = get_matching_token(tokens, case=case, pos='subst')
    position = tokens.index(noun)
    fill_token_data(noun, argument, position, position)
    tokens[position]['occupied'] = True

def mark_ncp(tokens, argument, case, phrase_type):
    """Mark an ncp argument: a noun followed by a clause introducer.

    The span opens at the first free 'subst' token in ``case`` and is
    closed by the phrase that ``mark_phrase`` finds after it.  The noun
    receives the argument data; the whole span becomes occupied.

    Raises ValueError when the noun cannot be found; errors raised by
    ``mark_phrase`` propagate.
    """
    noun = get_matching_token(tokens, case=case, pos='subst')
    first = tokens.index(noun)
    last = mark_phrase(tokens, first, argument, phrase_type)[1]
    fill_token_data(noun, argument, first, last)
    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_cp(tokens, argument, phrase_type):
    """Mark a cp argument: the word(s) introducing the clause.

    The search done by ``mark_phrase`` covers the whole sentence; the span
    it returns becomes occupied.  Errors raised by ``mark_phrase``
    propagate.
    """
    first, last = mark_phrase(tokens, -1, argument, phrase_type)
    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_adjp(tokens, argument, case):
    """Mark an adjp argument: one adjective in the given case.

    The first free 'adj' token in ``case`` receives the argument data and
    becomes occupied.

    Raises ValueError when no such token can be found.
    """
    adjective = get_matching_token(tokens, case=case, pos='adj')
    position = tokens.index(adjective)
    fill_token_data(adjective, argument, position, position)
    tokens[position]['occupied'] = True

def mark_infp(tokens, argument, aspect):
    """Mark an infp argument: one infinitive, optionally of a given aspect.

    The first free 'inf' token (carrying ``aspect`` when it is non-empty)
    receives the argument data and becomes occupied.

    Raises ValueError when no such token can be found.
    """
    infinitive = get_matching_token(tokens, aspect=aspect, pos='inf')
    position = tokens.index(infinitive)
    fill_token_data(infinitive, argument, position, position)
    tokens[position]['occupied'] = True

def mark_lemma(tokens, argument, lemma, sie, aspect):
    """Mark the token of the lemma itself, with u'się' when requested.

    The span opens at the first free token with base ``lemma`` (carrying
    ``aspect`` when it is non-empty).  When ``sie`` is truthy the span is
    closed by the first free token with base u'się' found from there on;
    otherwise it consists of the lemma token only.  The matched tokens
    receive the argument data; the whole span becomes occupied.

    Raises ValueError when a required token cannot be found.
    """
    verb = get_matching_token(tokens, base=lemma, aspect=aspect)
    first = tokens.index(verb)
    last = first
    if sie:
        reflexive = get_matching_token(tokens[first:], base=u'się')
        last = tokens.index(reflexive)
        fill_token_data(reflexive, argument, first, last)
    fill_token_data(verb, argument, first, last)

    for covered in tokens[first:last + 1]:
        covered['occupied'] = True

def mark_nonch(tokens, argument, nonch):
    """Mark a nonch argument: one of the given pronouns.

    ``nonch`` holds '|'-separated alternatives which are tried in order.
    A one-word alternative is matched by base among the free tokens.  For
    an alternative of several words, every word is matched by base, each
    one somewhere after the previous match.  The matched tokens receive
    the argument data; the whole span from the first to the last matched
    token becomes occupied.

    The search for a multi-word alternative now starts at the first token
    (formerly the token at position 0 was never considered).

    Raises ValueError when none of the alternatives can be matched
    (formerly this surfaced as UnboundLocalError).
    """
    for pronoun in nonch.split('|'):
        pronoun_parts = pronoun.split()
        if len(pronoun_parts) > 1:
            matched_tokens = []
            search_from = 0
            for pronoun_part in pronoun_parts:
                pronoun_token = get_matching_token(tokens[search_from:],
                                                   base=pronoun_part)
                if pronoun_token is None:
                    break
                # The next word has to follow the one just matched.
                search_from = tokens.index(pronoun_token) + 1
                matched_tokens.append(pronoun_token)
            if len(matched_tokens) != len(pronoun_parts):
                continue
        else:
            pronoun_token = get_matching_token(tokens, base=pronoun)
            if pronoun_token is None:
                continue
            matched_tokens = [pronoun_token]
        start_idx = tokens.index(matched_tokens[0])
        end_idx = tokens.index(matched_tokens[-1])
        for token in matched_tokens:
            fill_token_data(token, argument, start_idx, end_idx)
        for token in tokens[start_idx:end_idx+1]:
            token['occupied'] = True
        return
    # The callers in this file treat any exception as "argument not matched".
    raise ValueError('no token matches any of the pronouns %r' % (nonch,))

def mark_advp(tokens, argument, advp_type):
    """Mark an advp argument: one adverb selected by ``advp_type``.

    For 'pron' the first free 'adv' token with base 'tak' is looked up,
    then one with base 'jak'.  For 'misc' the first free 'adv' token of
    degree 'com' is looked up, then one of degree 'sup'.  The token found
    receives the argument data and becomes occupied.

    Raises ValueError when no token matches or when ``advp_type`` is
    neither 'pron' nor 'misc' (the latter formerly surfaced as
    UnboundLocalError).
    """
    if advp_type == 'pron':
        queries = [{'base': base} for base in ['tak', 'jak']]
    elif advp_type == 'misc':
        queries = [{'degree': degree} for degree in ['com', 'sup']]
    else:
        queries = []
    advp_token = None
    for query in queries:
        advp_token = get_matching_token(tokens, pos='adv', **query)
        if advp_token:
            break
    if advp_token is None:
        # The callers in this file treat any exception as "not matched".
        raise ValueError('no adverb matches advp type %r' % (advp_type,))
    start_idx = tokens.index(advp_token)
    end_idx = start_idx
    fill_token_data(advp_token, argument, start_idx, end_idx)
    for token in tokens[start_idx:end_idx+1]:
        token['occupied'] = True

def count_occupied(tokens):
    """Return the number of tokens flagged as occupied."""
    return sum(1 for token in tokens if token['occupied'])

def mark_arg_in_sentence(argument, sentence_tokens):
    """Dispatch ``argument`` to the marking function for its type.

    The argument is parsed with ``arg_from_text_rep``; its attributes are
    converted where needed (case, number, aspect, clause type) and handed
    to the matching ``mark_*`` function, which updates ``sentence_tokens``
    in place.  Argument types without a branch below (e.g. 'xp', 'refl')
    are ignored.  Errors raised by the marking functions propagate.
    """
    (arg_type, attributes, category) = arg_from_text_rep(argument)
    if arg_type == 'fixed':
        mark_fixed(sentence_tokens, argument, attributes[0])
    elif arg_type == 'preplexnp':
        mark_preplexnp(sentence_tokens, argument,
                       attributes[0],
                       case_conversion(attributes[1], category),
                       number_conversion(attributes[2]),
                       attributes[3])
    elif arg_type == 'comprepnp':
        # The attribute holds both words of the complex preposition.
        words = attributes[0].split()
        mark_comprepnp(sentence_tokens, argument, words[0], words[1])
    elif arg_type == 'prepnp':
        mark_prepnp(sentence_tokens, argument,
                    attributes[0],
                    case_conversion(attributes[1], category))
    elif arg_type == 'prepncp':
        mark_prepncp(sentence_tokens, argument,
                     attributes[0],
                     case_conversion(attributes[1], category),
                     phrase_type_conversion(attributes[2]))
    elif arg_type == 'prepadjp':
        mark_prepadjp(sentence_tokens, argument,
                      attributes[0],
                      case_conversion(attributes[1], category))
    elif arg_type == 'lexnp':
        mark_lexnp(sentence_tokens, argument,
                   case_conversion(attributes[0], category),
                   number_conversion(attributes[1]),
                   attributes[2])
    elif arg_type == 'np':
        mark_np(sentence_tokens, argument,
                case_conversion(attributes[0], category))
    elif arg_type == 'ncp':
        mark_ncp(sentence_tokens, argument,
                 case_conversion(attributes[0], category),
                 phrase_type_conversion(attributes[1]))
    elif arg_type == 'cp':
        mark_cp(sentence_tokens, argument,
                phrase_type_conversion(attributes[0]))
    elif arg_type == 'adjp':
        mark_adjp(sentence_tokens, argument,
                  case_conversion(attributes[0], category))
    elif arg_type == 'infp':
        mark_infp(sentence_tokens, argument,
                  aspect_conversion(attributes[0]))
    elif arg_type == u'nonch':
        mark_nonch(sentence_tokens, argument, u'co|coś|nic|to|to samo co')
    elif arg_type == 'lemma':
        mark_lemma(sentence_tokens, argument,
                   attributes[0],
                   attributes[1],
                   aspect_conversion(attributes[2]))
    elif arg_type == 'advp':
        mark_advp(sentence_tokens, argument, attributes[0])
#    elif arg_type == 'xp':
#        argument_obj = Argument.objects.get(text_rep=argument)
#        realizations = [realization.argument.text_rep for realization in argument_obj.realizations.all()]
#        start_occupacy = count_occupied(sentence_tokens)
#        for realization in sort_arguments(realizations):
#            mark_arg_in_sentence(realization, sentence_tokens)
#            if count_occupied(sentence_tokens) > start_occupacy:
#                break


def cut_sentence_chunks(sentence_tokens):
    """Describe the marked arguments of a sentence as one string.

    For the first token of every marked span an ``orths (argument)`` piece
    is produced, where ``orths`` are the space-joined 'orth' values of the
    tokens within the span that have an argument assigned.  Tokens up to
    the end of that span are then skipped.  The pieces are joined with
    single spaces.
    """
    pieces = []
    span_end = -1
    inside_span = False
    for token in sentence_tokens:
        if not inside_span and token['argument']:
            span = sentence_tokens[token['argument_start']:token['argument_end'] + 1]
            words = u' '.join([member['orth'] for member in span
                               if member['argument']])
            pieces.append(u'%s (%s)' % (words, token['argument']))
            span_end = token['argument_end']
            inside_span = True
        # Reaching the token that ends the current span re-enables output.
        inside_span = inside_span and token['idx'] != span_end
    return u' '.join(pieces)

def get_sentence_chunk(arguments, sentence_tokens):
    """Mark all ``arguments`` in ``sentence_tokens`` and describe the result.

    The arguments are marked one by one, in the given order, with
    ``mark_arg_in_sentence`` (which updates the tokens in place); the
    summary string built by ``cut_sentence_chunks`` is returned.
    """
    for arg in arguments:
        mark_arg_in_sentence(arg, sentence_tokens)
    return cut_sentence_chunks(sentence_tokens)

def create_lemma_argument(lemma_entry, frame_text_rep):
    """Build the ``lemma(entry,sie,aspect)`` pseudo-argument of a schema.

    ``frame_text_rep`` is split on ':'; its first field is the reflexive
    mark, the third one the aspect and the fourth one the schema
    structure.  When the reflexive mark is empty but the structure
    contains u'refl', u'się' is used as the mark.
    """
    fields = frame_text_rep.split(':')
    reflexive_mark, aspect, structure = fields[0], fields[2], fields[3]
    if u'refl' in structure and not reflexive_mark:
        reflexive_mark = u'się'
    return u'lemma(%s,%s,%s)' % (lemma_entry, reflexive_mark, aspect)

def get_arguments_coverage():
    """Add the matched sentence fragments to a detailed examples file.

    Reads ``detailed_examples_v2.csv`` from ``BASE_PATH`` (tab-separated,
    nine columns, first line is a header) and writes
    ``detailed_examples_cover_v2.csv`` with one additional column holding
    the fragments of the tagged sentence matched to the selected
    arguments.  Input lines containing 'Error!!!' are skipped.  When the
    sentence is not tagged, or when the arguments cannot be matched, an
    'Error!!!' message is written in place of the fragments.

    Raises ValueError when an input line does not have at least nine
    columns.
    """
    input_path = os.path.join(BASE_PATH, 'detailed_examples_v2.csv')
    output_path = os.path.join(BASE_PATH, 'detailed_examples_cover_v2.csv')
    # The rows written here have ten columns (no schema identifier), so
    # that label is left out of the header; formatting all eleven LABELS
    # with ten placeholders raised TypeError.
    header = [label for label in LABELS if label != 'identyfikator schematu']
    row_format = u'\t'.join([u'%s'] * 10) + u'\n'
    with codecs.open(input_path, 'rt', 'utf-8') as examples_file:
        with codecs.open(output_path, 'wt', 'utf-8') as output_file:
            output_file.write(u'\t'.join(header) + u'\n')
            for line_number, line in enumerate(examples_file):
                if line_number == 0:
                    # Header line of the input file.
                    continue
                if 'Error!!!' in line:
                    continue
                # Only the line ending is removed: stripping all whitespace
                # would also drop empty trailing columns.
                example_data = line.rstrip(u'\r\n').split('\t')
                (lemma_entry, lemma_status, frame_text_rep, frame_opinion,
                 sentence, tagged_sentence, example_opinion, example_source,
                 arguments_selection) = example_data[:9]
                if not tagged_sentence:
                    sentence_chunk = u'Error!!! Błąd tagowania.'
                else:
                    try:
                        arguments = [create_lemma_argument(lemma_entry,
                                                           frame_text_rep)]
                        arguments.extend(get_arguments(arguments_selection))
                        sentence_tokens = tokenize_sentence(tagged_sentence)
                        sentence_chunk = get_sentence_chunk(arguments,
                                                            sentence_tokens)
                    except Exception:
                        # Best effort: a failure for one example must not
                        # stop the export, it is reported in its row.
                        sentence_chunk = u'Error!!! Nie dopasowano wszystkich argumentów.'
                output_file.write(row_format % (lemma_entry,
                                                lemma_status,
                                                frame_text_rep,
                                                frame_opinion,
                                                sentence,
                                                tagged_sentence,
                                                sentence_chunk,
                                                example_opinion,
                                                example_source,
                                                arguments_selection))

def get_examples():
    """Export the detailed examples of lemmas having one of STATUSES_LS.

    One ``Q`` filter is built per status and the list is handed to
    ``write_detailed_examples``.  The other exports are switched off.
    """
    q_statuses = [Q(status__status=status) for status in STATUSES_LS]
    write_detailed_examples(q_statuses)
#    write_examples(q_statuses)
#    get_arguments_coverage()