#-*- coding:utf-8 -*-

from semantics.models import LexicalUnitExamples

import datetime

from django.db.models import Count
from lxml import etree
from xml.sax.saxutils import escape

from dictionary.models import Argument, Atribute_Model, WalentyStat, \
    sortArguments, sortatributes, sortPositions, sort_positions

XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'


def createteixml(outpath, lemmas, frame_opinion_values):
    """Serialize the given lemmas as a TEI document and write it to outpath.

    Builds the root element, the TEI header and one entry per lemma (via
    write_entries), then writes the pretty-printed, UTF-8 encoded document
    with an XML declaration and a TEI doctype.

    outpath -- path of the file to create (overwritten if it exists).
    lemmas -- iterable of lemma objects passed on to write_entries.
    frame_opinion_values -- passed on to write_entries, which uses it to
        filter each lemma's frame opinions.
    """
    root = write_root()
    write_header(root)
    write_entries(root, lemmas, frame_opinion_values)
    # etree.tostring() called with an explicit encoding returns encoded
    # bytes, so the file is opened in binary mode: no newline translation
    # is applied to the payload and the write stays valid if the str/bytes
    # distinction is enforced.
    with open(outpath, 'wb') as output_file:
        output_file.write(etree.tostring(root, pretty_print=True,
                                         xml_declaration=True, encoding='UTF-8',
                                         doctype=u'<!DOCTYPE TEI SYSTEM "tei_all.dtd">'))


def write_root():
    """Create and return the root <TEI> element.

    The element carries xml:lang="pl" and an 'xmlns' attribute pointing at
    the TEI namespace URI.
    """
    root = etree.Element('TEI')
    root.attrib[etree.QName(XML_NAMESPACE, 'lang')] = u'pl'
    root.attrib['xmlns'] = u'http://www.tei-c.org/ns/1.0'
    return root


def write_header(root, extensions_file=False):
    """Append the <teiHeader> subtree to root.

    root -- element that receives the header.
    extensions_file -- when true, the title states that the file is only a
        part of Walenty; otherwise the plain dictionary title is used.

    The header holds the title, the publisher, a <date> whose 'when'
    attribute is the current date (YYYY-MM-DD), the licence information
    produced by write_license_elem and a note about the generating tool.
    """
    tei_header = etree.SubElement(root, 'teiHeader')
    file_desc = etree.SubElement(tei_header, 'fileDesc')

    title_stmt = etree.SubElement(file_desc, 'titleStmt')
    title = etree.SubElement(title_stmt, 'title')
    if extensions_file:
        title.text = u'This file is part of Walenty: a valence dictionary of Polish (http://zil.ipipan.waw.pl/Walenty)'
    else:
        title.text = u'Walenty: a valence dictionary of Polish (http://zil.ipipan.waw.pl/Walenty)'

    publication_stmt = etree.SubElement(file_desc, 'publicationStmt')
    publisher = etree.SubElement(publication_stmt, 'publisher')
    publisher.text = u'Institute of Computer Science, Polish Academy of Sciences (IPI PAN)'
    date = etree.SubElement(publication_stmt, 'date')
    date.attrib['when'] = datetime.datetime.now().strftime('%Y-%m-%d')
    write_license_elem(publication_stmt)

    source_desc = etree.SubElement(file_desc, 'sourceDesc')
    p = etree.SubElement(source_desc, 'p')
    p.text = u'File generated using Slowal. Mentioned tool available here: http://zil.ipipan.waw.pl/Slowal.'
def write_license_elem(parent_elem):
    """Append the <availability>/<licence> subtree to parent_elem.

    The licence consists of a series of <p> paragraphs: copyright, licence
    statement, a summary with the current schema/frame/lemma counts read
    from WalentyStat, and credits.
    """
    availability = etree.SubElement(parent_elem, 'availability')
    licence = etree.SubElement(availability, 'licence')
    licence.attrib['target'] = u'http://creativecommons.org/licenses/by-sa/4.0/'

    p = etree.SubElement(licence, 'p')
    p.text = u'(C) Copyright 2012–2017 by the Institute of Computer Science, Polish Academy of Sciences (IPI PAN)'

    p = etree.SubElement(licence, 'p')
    p.text = u'This work is distributed under a CC BY-SA license: http://creativecommons.org/licenses/by-sa/4.0/'

    # The counts are looked up by their (Polish) WalentyStat labels.
    p = etree.SubElement(licence, 'p')
    p.text = u'Walenty is a valence dictionary of Polish developed at the Institute of Computer Science, Polish Academy of Sciences (IPI PAN). It currently contains %s schemata and %s frames for %s lemmata.' % (
        WalentyStat.objects.get(label=u'Łączna liczba schematów').value,
        WalentyStat.objects.get(label=u'Łączna liczba ram semantycznych').value,
        WalentyStat.objects.get(label=u'Łączna liczba haseł').value)

    p = etree.SubElement(licence, 'p')
    p.text = u'The original formalism of Walenty was established by Filip Skwarski, Elżbieta Hajnicz, Agnieszka Patejuk, Adam Przepiórkowski, Marcin Woliński, Marek Świdziński, and Magdalena Zawisławska. It has been further developed by Elżbieta Hajnicz, Agnieszka Patejuk, Adam Przepiórkowski, and Marcin Woliński. The semantic layer has been developed by Elżbieta Hajnicz and Anna Andrzejczuk.'

    p = etree.SubElement(licence, 'p')
    p.text = u'The original seed of Walenty was provided by the automatic conversion, manually reviewed by Filip Skwarski, of the verbal valence dictionary used by the Świgra2 parser (6396 schemata for 1462 lemmata), which was in turn based on SDPV, the Syntactic Dictionary of Polish Verbs by Marek Świdziński (4148 schemata for 1064 lemmata). Afterwards, Walenty has been developed independently by adding new entries, syntactic schemata, in particular phraseological ones, and semantic frames.'

    p = etree.SubElement(licence, 'p')
    p.text = u'Walenty has been edited and compiled using the Slowal tool (http://zil.ipipan.waw.pl/Slowal) created by Bartłomiej Nitoń and Tomasz Bartosiak.'


def write_entries(root, lemmas, frame_opinion_values):
    """Append <text>/<body> to root and write one <entry> per lemma.

    For each lemma only the frame opinions whose value is in
    frame_opinion_values are passed on to write_entry.
    """
    text = etree.SubElement(root, 'text')
    body = etree.SubElement(text, 'body')
    for lemma in lemmas:
        frame_opinions = lemma.frame_opinions.filter(value__in=frame_opinion_values)
        write_entry(body, lemma, frame_opinions, frame_opinion_values)


def write_entry(body_elem, lemma, frame_opinions, frame_opinion_values):
    """Append a complete <entry> element for lemma to body_elem.

    The entry holds the form (orthography and part of speech), the status
    info, the syntactic layer and the examples layer. The semantic,
    meanings and connections layers are written only when
    lemma.semantics_ready() is true.
    """
    entry_xml_id = 'wal_%s-ent' % str(lemma.entry_obj.id)
    entry_elem = etree.SubElement(body_elem, 'entry')
    entry_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = entry_xml_id

    form_elem = etree.SubElement(entry_elem, 'form')
    orth_elem = etree.SubElement(form_elem, 'orth')
    orth_elem.text = lemma.entry
    pos_elem = etree.SubElement(form_elem, 'pos')
    pos_elem.text = lemma.entry_obj.pos.tag

    write_status_info(entry_elem, lemma)
    write_syntactic_layer(entry_elem, lemma, frame_opinions, frame_opinion_values)
    write_examples_layer(entry_elem, lemma)
    # NOTE(review): all three semantic-side layers are taken to be guarded
    # by semantics_ready() -- confirm against the intended nesting.
    if lemma.semantics_ready():
        write_semantic_layer(entry_elem, lemma)
        write_meanings_layer(entry_elem, lemma)
        write_connections_layer(entry_elem, lemma)


def write_status_info(parent_elem, lemma):
    """Append the 'general_info' feature structure holding the lemma status."""
    general_fs_elem = etree.SubElement(parent_elem, 'fs')
    general_fs_elem.attrib['type'] = 'general_info'
    status_f_elem = etree.SubElement(general_fs_elem, 'f')
    status_f_elem.attrib['name'] = 'status'
    status_string = etree.SubElement(status_f_elem, 'string')
    status_string.text = unicode(lemma.status)


def write_syntactic_layer(entry_elem, lemma, frame_opinions, frame_opinion_values):
    """Append the 'syntactic_layer' feature structure with the lemma schemata.

    Schemata are visited grouped by every combination of the reflexivity,
    negativity, predicativity and aspect values existing for the lemma, and
    ordered by 'text_rep' inside each group. Phraseologic schemata are
    skipped unless lemma.phraseology_ready(). When frame_opinion_values is
    non-empty, only schemata having an opinion in frame_opinions are written.
    """
    synt_layer_fs_elem = etree.SubElement(entry_elem, 'fs')
    synt_layer_fs_elem.attrib['type'] = 'syntactic_layer'
    schemata_f_elem = etree.SubElement(synt_layer_fs_elem, 'f')
    schemata_f_elem.attrib['name'] = 'schemata'
    vColl_elem = etree.SubElement(schemata_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'

    # Neither check depends on the schema, so evaluate them once instead
    # of once per schema inside the nested loops.
    phraseology_ready = lemma.phraseology_ready()
    filter_by_opinion = frame_opinion_values.exists()

    for reflex_val in lemma.get_existing_frame_char_values(u'ZWROTNOŚĆ'):
        for neg_val in lemma.get_existing_frame_char_values(u'NEGATYWNOŚĆ'):
            for pred_val in lemma.get_existing_frame_char_values(u'PREDYKATYWNOŚĆ'):
                for aspect_val in lemma.get_existing_frame_char_values(u'ASPEKT'):
                    matchingframes = lemma.get_frames_by_char_values(
                        reflex_val=reflex_val,
                        neg_val=neg_val,
                        pred_val=pred_val,
                        aspect_val=aspect_val).order_by('text_rep')
                    for frame in matchingframes:
                        if not phraseology_ready and frame.phraseologic:
                            continue
                        if (not filter_by_opinion or
                                frame_opinions.filter(frame=frame).exists()):
                            write_schema(vColl_elem, frame, lemma)


def write_schema(parent_elem, schema, lemma):
    """Append a 'schema' feature structure describing one syntactic schema.

    Written features: text_rep, opinion ('unk' when the lemma has no
    opinion for the schema), inherent_sie, aspect, negativity,
    predicativity and, through write_positions_feature, the positions.
    """
    schema_xml_id = 'wal_%s.%s-sch' % (str(lemma.entry_obj.id), str(schema.id))

    schema_fs_elem = etree.SubElement(parent_elem, 'fs')
    schema_fs_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = schema_xml_id
    schema_fs_elem.attrib['type'] = 'schema'

    # textual representation
    text_rep_f_elem = etree.SubElement(schema_fs_elem, 'f')
    text_rep_f_elem.attrib['name'] = 'text_rep'
    text_rep_string = etree.SubElement(text_rep_f_elem, 'string')
    text_rep = schema.get_position_spaced_text_rep()
    if schema.characteristics.filter(type=u'ZWROTNOŚĆ', value__value=u'się').exists():
        text_rep = ' ' + text_rep
    text_rep_string.text = lemma.entry_obj.name + text_rep.replace(':', ': ')

    # schema opinion
    try:
        schema_opinion = lemma.frame_opinions.filter(frame=schema).all()[0].value.short
    except IndexError:
        schema_opinion = 'unk'
    opinion_f_elem = etree.SubElement(schema_fs_elem, 'f')
    opinion_f_elem.attrib['name'] = 'opinion'
    opinion_symbol = etree.SubElement(opinion_f_elem, 'symbol')
    opinion_symbol.attrib['value'] = schema_opinion

    # reflexivity
    reflex = schema.characteristics.get(type=u'ZWROTNOŚĆ')
    selfmark_f_elem = etree.SubElement(schema_fs_elem, 'f')
    selfmark_f_elem.attrib['name'] = 'inherent_sie'
    selfmark_binary = etree.SubElement(selfmark_f_elem, 'binary')
    if reflex.value.value:
        selfmark_binary.attrib['value'] = 'true'
    else:
        selfmark_binary.attrib['value'] = 'false'

    # aspect (the <f> is written even when there is no value to put in it)
    aspect = schema.characteristics.get(type=u'ASPEKT').value.value
    aspect_f_elem = etree.SubElement(schema_fs_elem, 'f')
    aspect_f_elem.attrib['name'] = 'aspect'
    if aspect:
        aspect_symbol = etree.SubElement(aspect_f_elem, 'symbol')
        aspect_symbol.attrib['value'] = aspect

    # negativity (same convention as aspect)
    negativity = schema.characteristics.get(type=u'NEGATYWNOŚĆ').value.value
    negativity_f_elem = etree.SubElement(schema_fs_elem, 'f')
    negativity_f_elem.attrib['name'] = 'negativity'
    if negativity:
        negativity_symbol = etree.SubElement(negativity_f_elem, 'symbol')
        negativity_symbol.attrib['value'] = negativity

    # predicativity
    predicativity = schema.characteristics.get(type=u'PREDYKATYWNOŚĆ').value.value
    predicativity_f_elem = etree.SubElement(schema_fs_elem, 'f')
    predicativity_f_elem.attrib['name'] = 'predicativity'
    predicativity_binary = etree.SubElement(predicativity_f_elem, 'binary')
    if predicativity:
        predicativity_binary.attrib['value'] = 'true'
    else:
        predicativity_binary.attrib['value'] = 'false'

    # syntactic positions
    write_positions_feature(schema, schema_xml_id, schema_fs_elem)


def write_positions_feature(schema, schema_xml_id, parent_elem):
    """Append the 'positions' feature of a schema; nothing if it has none."""
    sorted_pos_dict = sortPositions(schema.positions.all())
    if sorted_pos_dict:
        positions_f_elem = etree.SubElement(parent_elem, 'f')
        positions_f_elem.attrib['name'] = 'positions'
        vColl_elem = etree.SubElement(positions_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'
        for position in sorted_pos_dict:
            write_position_elem(vColl_elem, schema_xml_id, position['position'])


def write_position_elem(parent_elem, schema_xml_id, position):
    """Append a 'position' feature structure.

    schema_xml_id -- xml:id of the owning schema; when empty or None the
        position (and the phrases written below it) get no xml:id.
    """
    position_xml_id = None
    position_fs_elem = etree.SubElement(parent_elem, 'fs')
    if schema_xml_id:
        # The position id is derived from the schema id:
        # '...-sch' becomes '....<position id>-psn'.
        position_xml_id = schema_xml_id.replace(u'-sch', '.%d-psn' % position.id)
        position_fs_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = position_xml_id
    position_fs_elem.attrib['type'] = 'position'

    functions = position.categories.filter(control=False)
    if functions.exists():
        function_f_elem = etree.SubElement(position_fs_elem, 'f')
        function_f_elem.attrib['name'] = 'function'
        function_symbol_elem = etree.SubElement(function_f_elem, 'symbol')
        function_symbol_elem.attrib['value'] = functions[0].category

    write_control_features(position_fs_elem, position)
    write_phrases_feature(position_fs_elem, position, position_xml_id)


def write_control_features(parent_elem, position):
    """Append the 'control' feature of a position, if it has any control.

    Control categories are split into those whose name ends with '2' and
    the rest; at most one symbol from each group is written.
    """
    controls1 = position.categories.filter(control=True).exclude(category__endswith='2')
    controls2 = position.categories.filter(control=True, category__endswith='2')
    if controls1.exists() or controls2.exists():
        control_f_elem = etree.SubElement(parent_elem, 'f')
        control_f_elem.attrib['name'] = 'control'
        vColl_elem = etree.SubElement(control_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'
        if controls1.exists():
            control = controls1[0].category
            control1_symbol_elem = etree.SubElement(vColl_elem, 'symbol')
            control1_symbol_elem.attrib['value'] = control
        if controls2.exists():
            control = controls2[0].category
            control2_symbol_elem = etree.SubElement(vColl_elem, 'symbol')
            control2_symbol_elem.attrib['value'] = control


def write_phrases_feature(parent_elem, position, position_xml_id):
    """Append the 'phrases' feature of a position; nothing if it has none."""
    sorted_phrases = sortArguments(position.arguments.all())
    if sorted_phrases:
        phrases_f_elem = etree.SubElement(parent_elem, 'f')
        phrases_f_elem.attrib['name'] = 'phrases'
        vColl_elem = etree.SubElement(phrases_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'
        for phrase in sorted_phrases:
            write_phrase(vColl_elem, phrase, position_xml_id)


def write_phrase(parent_elem, phrase, position_xml_id, write_expansions_id=True):
    """Append a feature structure describing one phrase type.

    position_xml_id -- xml:id of the owning position; when empty or None
        the phrase gets no xml:id.
    write_expansions_id -- when true and the phrase has realizations, a
        link to its expansions entry is written as well.
    """
    phrase_fs_elem = etree.SubElement(parent_elem, 'fs')
    if position_xml_id:
        # '...-psn' becomes '....<phrase id>-phr'.
        phrase_xml_id = position_xml_id.replace(u'-psn', '.%d-phr' % phrase.id)
        phrase_fs_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = phrase_xml_id
    phrase_fs_elem.attrib['type'] = phrase.type
    if phrase.realizations.exists() and write_expansions_id:
        write_expansions_link(phrase_fs_elem, phrase)
    write_attributes(phrase_fs_elem, phrase)


def write_expansions_link(parent_elem, phrase):
    """Append an 'expansions' feature pointing at '#wal_<phrase id>-exp'."""
    expansions_f_elem = etree.SubElement(parent_elem, 'f')
    expansions_f_elem.attrib['name'] = 'expansions'
    expansions_link_elem = etree.SubElement(expansions_f_elem, 'fs')
    expansions_link_elem.attrib['sameAs'] = '#wal_%d-exp' % phrase.id
    # NOTE(review): no 'type' attribute is emitted on the link; the
    # assignment below was already disabled in the original code.
    #expansions_link_elem.attrib['type'] = 'phrase_type_expansions'


def write_attributes(parent_elem, phrase):
    """Write every attribute of phrase, in the order given by sortatributes."""
    attributes = sortatributes(phrase)
    for attribute in attributes:
        write_attribute(parent_elem, attribute)


def write_attribute(parent_elem, attribute):
    """Append an <f> for one phrase attribute and fill in its value.

    The writer used for the value depends on the attribute model's type
    ('text', 'parameter', 'argument' or 'position') and on whether the
    model defines any value selection modes. For any other type the <f>
    is left empty.
    """
    attribute_model = Atribute_Model.objects.get(atr_model_name=attribute.type)
    attr_f_elem = etree.SubElement(parent_elem, 'f')
    attr_f_elem.attrib['name'] = attribute_model.sym_name

    attribute_type = attribute_model.type.sym_name
    selection_modes = attribute_model.values_selection_modes
    if attribute_type == 'text' and not selection_modes.exists():
        write_simple_text_attr(attr_f_elem, attribute)
    elif attribute_type == 'text' and selection_modes.exists():
        write_complex_text_attr(attr_f_elem, attribute_model, attribute)
    elif attribute_type == 'parameter' and not selection_modes.exists():
        write_simple_parameter_attr(attr_f_elem, attribute_model, attribute)
    elif attribute_type == 'parameter' and selection_modes.exists():
        write_complex_parameter_attr(attr_f_elem, attribute_model, attribute)
    elif attribute_type == 'argument' and not selection_modes.exists():
        write_simple_phrase_type_attr(attr_f_elem, attribute)
    elif attribute_type == 'argument' and selection_modes.exists():
        write_complex_phrase_type_attr(attr_f_elem, attribute)
    elif attribute_type == 'position':
        write_complex_position_attr(attr_f_elem, attribute)


def write_simple_text_attr(parent_elem, attribute):
    """Append a <string> with the attribute text, outer apostrophes stripped."""
    string_elem = etree.SubElement(parent_elem, 'string')
    string_elem.text = unicode(attribute).strip("'")


def write_complex_text_attr(parent_elem, attribute_model, attribute):
    """Append a '<name>_def' structure: selection mode, separator and lemmas."""
    complex_lemma_fs_elem = etree.SubElement(parent_elem, 'fs')
    complex_lemma_fs_elem.attrib['type'] = '%s_def' % parent_elem.attrib['name']
    write_selection_mode_and_separator(complex_lemma_fs_elem, attribute_model, attribute)
    write_lemmas(complex_lemma_fs_elem, attribute)


def write_selection_mode_and_separator(parent_elem, attr_model, attribute):
    """Append the 'selection_mode' and 'cooccurrence' features.

    When the attribute does not set a selection mode or a separator, the
    first one defined on the attribute model (ordered by 'priority') is
    used. The separators ';' and ',' are exported as 'coord' and 'concat';
    any other separator symbol is exported unchanged.
    """
    if attribute.selection_mode:
        selection_mode = attribute.selection_mode.name
    else:
        selection_mode = attr_model.values_selection_modes.order_by('priority')[0].name
    sel_mode_f_elem = etree.SubElement(parent_elem, 'f')
    sel_mode_f_elem.attrib['name'] = 'selection_mode'
    sel_mode_symbol_elem = etree.SubElement(sel_mode_f_elem, 'symbol')
    sel_mode_symbol_elem.attrib['value'] = selection_mode

    if attribute.separator:
        separator = attribute.separator.symbol
    else:
        separator = attr_model.value_separators.order_by('priority')[0].symbol
    # NOTE(review): the symbol mapping is taken to apply to both branches
    # above -- confirm against the intended nesting.
    if separator == ';':
        separator = 'coord'
    elif separator == ',':
        separator = 'concat'
    separator_f_elem = etree.SubElement(parent_elem, 'f')
    separator_f_elem.attrib['name'] = 'cooccurrence'
    separator_symbol_elem = etree.SubElement(separator_f_elem, 'symbol')
    separator_symbol_elem.attrib['value'] = separator


def write_lemmas(parent_elem, attribute):
    """Append the 'lemmas' feature listing attribute values ordered by text."""
    lemmas = [unicode(value) for value in attribute.values.order_by('text')]
    lemmas_f_elem = etree.SubElement(parent_elem, 'f')
    lemmas_f_elem.attrib['name'] = 'lemmas'
    vColl_elem = etree.SubElement(lemmas_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    for lemma in lemmas:
        string_elem = etree.SubElement(vColl_elem, 'string')
        string_elem.text = lemma.strip("'")


def write_simple_parameter_attr(parent_elem, attribute_model, attribute):
    """Write the first value of a single-valued parameter attribute."""
    param_value = attribute.values.all()[0]
    write_parameter(parent_elem, attribute_model, param_value)


def write_parameter(parent_elem, attribute_model, param_value):
    """Append the representation of one parameter value to parent_elem.

    Models using subparameters get a '<name>_def' structure with a
    'conjunction' symbol and, if present, the 'constraints'. Otherwise the
    value is written as <binary> ('reflex'), <string>
    ('complex_preposition') or, when non-empty, <symbol>.
    """
    if attribute_model.use_subparams():
        param_fs_elem = etree.SubElement(parent_elem, 'fs')
        # NOTE(review): parent_elem must carry a 'name' attribute here;
        # write_complex_parameter_attr passes a <vColl> that has none, so
        # that combination would raise KeyError -- confirm it cannot occur.
        param_fs_elem.attrib['type'] = '%s_def' % parent_elem.attrib['name']
        value_f_elem = etree.SubElement(param_fs_elem, 'f')
        value_f_elem.attrib['name'] = 'conjunction'
        value = param_value.parameter.type.name
        symbol_elem = etree.SubElement(value_f_elem, 'symbol')
        symbol_elem.attrib['value'] = value
        if param_value.parameter.subparameters.exists():
            write_parameter_subparameters(param_fs_elem, param_value.parameter)
    else:
        value = unicode(param_value)
        if attribute_model.sym_name == 'reflex':
            selfmark_binary = etree.SubElement(parent_elem, 'binary')
            if value:
                selfmark_binary.attrib['value'] = 'true'
            else:
                selfmark_binary.attrib['value'] = 'false'
        elif attribute_model.sym_name == 'complex_preposition':
            string_elem = etree.SubElement(parent_elem, 'string')
            string_elem.text = value
        elif value:
            symbol_elem = etree.SubElement(parent_elem, 'symbol')
            symbol_elem.attrib['value'] = value


def write_parameter_subparameters(parent_elem, parameter):
    """Append the 'constraints' feature with subparameters ordered by name."""
    subparams_f_elem = etree.SubElement(parent_elem, 'f')
    subparams_f_elem.attrib['name'] = 'constraints'
    vColl_elem = etree.SubElement(subparams_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    for subparameter in parameter.subparameters.order_by('name'):
        write_subparameter(vColl_elem, subparameter)


def write_subparameter(parent_elem, subparameter):
    """Append a <symbol> whose value is the subparameter name."""
    symbol_elem = etree.SubElement(parent_elem, 'symbol')
    symbol_elem.attrib['value'] = subparameter.name


def write_complex_parameter_attr(parent_elem, attribute_model, attribute):
    """Append a set of parameter values ordered by 'parameter__type'."""
    vColl_elem = etree.SubElement(parent_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    for value in attribute.values.order_by('parameter__type'):
        write_parameter(vColl_elem, attribute_model, value)


def write_simple_phrase_type_attr(parent_elem, attribute):
    """Write the phrase held by the first value of the attribute (no xml:id)."""
    write_phrase(parent_elem, attribute.values.all()[0].argument, None)


def write_complex_phrase_type_attr(parent_elem, attribute):
    """Write a phrase-typed attribute that has a selection mode.

    The 'list' mode is exported as a plain set of phrases; any other mode
    as a '<name>_def' structure produced by write_typed_phrase_attr.
    """
    selection_mode = attribute.selection_mode
    if selection_mode.sym_name == 'list':
        phrases = [value.argument for value in attribute.values.all()]
        write_phrases_set(parent_elem, phrases)
    else:
        complex_phrase_fs_elem = etree.SubElement(parent_elem, 'fs')
        complex_phrase_fs_elem.attrib['type'] = '%s_def' % parent_elem.attrib['name']
        write_typed_phrase_attr(complex_phrase_fs_elem, attribute)


def write_phrases_set(parent_elem, phrases):
    """Append a <vColl> set with phrases in sortArguments order (no xml:ids)."""
    vColl_elem = etree.SubElement(parent_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    sorted_phrases = sortArguments(phrases)
    for phrase in sorted_phrases:
        write_phrase(vColl_elem, phrase, None)


def write_typed_phrase_attr(parent_elem, attribute):
    """Append the 'name' feature (selection mode) and optional 'constraints'."""
    selection_mode = attribute.selection_mode
    type_f_elem = etree.SubElement(parent_elem, 'f')
    type_f_elem.attrib['name'] = 'name'
    symbol_elem = etree.SubElement(type_f_elem, 'symbol')
    symbol_elem.attrib['value'] = selection_mode.name

    if attribute.values.exists():
        phrases_f_elem = etree.SubElement(parent_elem, 'f')
        phrases_f_elem.attrib['name'] = 'constraints'
        phrases = [value.argument for value in attribute.values.all()]
        write_phrases_set(phrases_f_elem, phrases)


def write_complex_position_attr(parent_elem, attribute):
    """Append a '<name>_def' structure for a position-typed attribute.

    Holds the 'type' feature (selection mode name) and, when the attribute
    has values, the 'positions' set written without xml:ids.
    """
    complex_positions_fs_elem = etree.SubElement(parent_elem, 'fs')
    complex_positions_fs_elem.attrib['type'] = '%s_def' % parent_elem.attrib['name']

    selection_mode = attribute.selection_mode
    type_f_elem = etree.SubElement(complex_positions_fs_elem, 'f')
    type_f_elem.attrib['name'] = 'type'
    symbol_elem = etree.SubElement(type_f_elem, 'symbol')
    symbol_elem.attrib['value'] = selection_mode.name

    if attribute.values.exists():
        positions_f_elem = etree.SubElement(complex_positions_fs_elem, 'f')
        positions_f_elem.attrib['name'] = 'positions'
        vColl_elem = etree.SubElement(positions_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'

        positions = [value.position for value in attribute.values.all()]
        sorted_positions = sort_positions(positions)
        for position in sorted_positions:
            write_position_elem(vColl_elem, None, position)


def write_examples_layer(parent_elem, lemma):
    """Append the 'examples_layer' feature structure of an entry."""
    examples_layer_elem = etree.SubElement(parent_elem, 'fs')
    examples_layer_elem.attrib['type'] = 'examples_layer'
    examples_f_elem = etree.SubElement(examples_layer_elem, 'f')
    examples_f_elem.attrib['name'] = 'examples'
    vColl_elem = etree.SubElement(examples_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    write_examples_feature(vColl_elem, lemma)


def write_examples_feature(parent_elem, lemma):
    """Write the lemma examples, both groups ordered by opinion priority.

    Examples from lemma.nkjp_examples attached to a phraseologic schema
    are skipped unless lemma.phraseology_ready(); examples from
    lemma.lemma_nkjp_examples are always written.
    """
    # The readiness check does not depend on the example: evaluate it once.
    phraseology_ready = lemma.phraseology_ready()
    for example in lemma.nkjp_examples.order_by('opinion__priority').all():
        if not phraseology_ready and example.frame.phraseologic:
            continue
        write_example(parent_elem, lemma, example)
    for example in lemma.lemma_nkjp_examples.order_by('opinion__priority').all():
        write_example(parent_elem, lemma, example)


def write_example(parent_elem, lemma, example):
    """Append an 'example' feature structure.

    Written features: an optional meaning link (only when
    lemma.semantics_ready()), links to the illustrated phrases, the
    sentence, its source, its opinion and, if present, a note.
    """
    entry = lemma.entry_obj
    example_xml_id = u'wal_%s.%s-exm' % (str(entry.id), str(example.id))

    example_fs_elem = etree.SubElement(parent_elem, 'fs')
    example_fs_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = example_xml_id
    example_fs_elem.attrib['type'] = 'example'

    # NOTE(review): only the meaning link is taken to depend on
    # semantics_ready(); phrase links are written unconditionally --
    # confirm against the intended nesting.
    if lemma.semantics_ready():
        get_and_write_meaning_link(example_fs_elem, entry, example)
    write_phrases_links(example_fs_elem, entry, example)

    sentence_f_elem = etree.SubElement(example_fs_elem, 'f')
    sentence_f_elem.attrib['name'] = 'sentence'
    sentence_content_elem = etree.SubElement(sentence_f_elem, 'string')
    # lxml escapes '&', '<' and '>' itself when the tree is serialized, so
    # the raw text is assigned; escaping it beforehand would end up
    # double-escaped in the output ('&' -> '&amp;amp;').
    sentence_content_elem.text = example.sentence

    source_f_elem = etree.SubElement(example_fs_elem, 'f')
    source_f_elem.attrib['name'] = 'source'
    source_symbol_elem = etree.SubElement(source_f_elem, 'symbol')
    source_symbol_elem.attrib['value'] = example.source.sym_name

    opinion_f_elem = etree.SubElement(example_fs_elem, 'f')
    opinion_f_elem.attrib['name'] = 'opinion'
    opinion_symbol_elem = etree.SubElement(opinion_f_elem, 'symbol')
    opinion_symbol_elem.attrib['value'] = example.opinion.opinion

    if example.comment:
        note_f_elem = etree.SubElement(example_fs_elem, 'f')
        note_f_elem.attrib['name'] = 'note'
        note_content_elem = etree.SubElement(note_f_elem, 'string')
        # Raw text for the same reason as the sentence above.
        note_content_elem.text = example.comment


def get_and_write_meaning_link(parent_elem, entry, example):
    """Append a 'meaning' link for the example, if it is tied to a meaning.

    Nothing is written when no LexicalUnitExamples row connects the
    example with one of the entry's lexical units.
    """
    entry_lex_units = entry.meanings.all()
    # Only the lookup is guarded, so that unrelated errors raised while
    # building the elements are not mistaken for "no meaning".
    try:
        lex_unit_example = LexicalUnitExamples.objects.get(example=example,
                                                           lexical_unit__in=entry_lex_units)
    except LexicalUnitExamples.DoesNotExist:
        return

    meaning = lex_unit_example.lexical_unit
    meaning_xml_id = u'#wal_%s.%s-mng' % (str(entry.id), str(meaning.id))

    meaning_f_elem = etree.SubElement(parent_elem, 'f')
    meaning_f_elem.attrib['name'] = 'meaning'
    meaning_link_elem = etree.SubElement(meaning_f_elem, 'fs')
    meaning_link_elem.attrib['sameAs'] = meaning_xml_id
    meaning_link_elem.attrib['type'] = 'lexical_unit'


def write_phrases_links(parent_elem, entry, example):
    """Append the 'phrases' feature linking the example to its phrases."""
    phrases_f_elem = etree.SubElement(parent_elem, 'f')
    phrases_f_elem.attrib['name'] = 'phrases'
    vColl_elem = etree.SubElement(phrases_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    for phrase_selection in example.arguments.all():
        create_and_write_phrase_link(vColl_elem, entry, example, phrase_selection)


def create_and_write_phrase_link(parent_elem, entry, example, phrase_selection):
    """Append one 'phrase' link per phrase of the given selection.

    Links have the form '#wal_<entry>.<schema>.<position>.<phrase>-phr'.
    """
    link_base = u'#wal_%d.%d.%d.' % (entry.id, example.frame.id, phrase_selection.position.id)
    for phrase in phrase_selection.arguments.all():
        link_end = u'%d-phr' % phrase.id
        link = link_base + link_end
        phrase_link_elem = etree.SubElement(parent_elem, 'fs')
        phrase_link_elem.attrib['sameAs'] = link
        phrase_link_elem.attrib['type'] = 'phrase'


def write_semantic_layer(parent_elem, lemma):
    """Append the 'semantic_layer' feature structure of an entry."""
    semantic_layer_elem = etree.SubElement(parent_elem, 'fs')
    semantic_layer_elem.attrib['type'] = 'semantic_layer'
    frames_f_elem = etree.SubElement(semantic_layer_elem, 'f')
    frames_f_elem.attrib['name'] = 'frames'
    vColl_elem = etree.SubElement(frames_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    write_frames(vColl_elem, lemma)


def write_frames(parent_elem, lemma):
    """Write every frame returned by the entry's actual_frames()."""
    entry = lemma.entry_obj
    frames = entry.actual_frames()
    for frame in frames:
        write_frame_fs(parent_elem, entry, frame)


def write_frame_fs(parent_elem, entry, frame):
    """Append a 'frame' structure: opinion, meanings and arguments."""
    frame_xml_id = u'wal_%d.%d-frm' % (entry.id, frame.id)
    frame_fs_elem = etree.SubElement(parent_elem, 'fs')
    frame_fs_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = frame_xml_id
    frame_fs_elem.attrib['type'] = 'frame'
    write_frame_opinion(frame_fs_elem, frame)
    write_frame_meanings(frame_fs_elem, entry, frame)
    write_frame_arguments(frame_fs_elem, entry, frame)


def write_frame_opinion(parent_elem, frame):
    """Append the frame 'opinion' feature ('unk' when none is selected)."""
    frame_opinion = 'unk'
    if frame.opinion_selected():
        frame_opinion = frame.opinion.short
    opinion_f_elem = etree.SubElement(parent_elem, 'f')
    opinion_f_elem.attrib['name'] = 'opinion'
    opinion_symbol = etree.SubElement(opinion_f_elem, 'symbol')
    opinion_symbol.attrib['value'] = frame_opinion


def write_frame_meanings(parent_elem, entry, frame):
    """Append the 'meanings' feature linking the frame to its lexical units."""
    meanings_f_elem = etree.SubElement(parent_elem, 'f')
    meanings_f_elem.attrib['name'] = 'meanings'
    vColl_elem = etree.SubElement(meanings_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    for meaning in frame.lexical_units.all():
        write_frame_meaning_link(vColl_elem, entry, meaning)


def write_frame_meaning_link(parent_elem, entry, meaning):
    """Append a 'lexical_unit' link of the form '#wal_<entry>.<meaning>-mng'."""
    link = u'#wal_%d.%d-mng' % (entry.id, meaning.id)
    lex_unit_link_elem = etree.SubElement(parent_elem, 'fs')
    lex_unit_link_elem.attrib['sameAs'] = link
    lex_unit_link_elem.attrib['type'] = 'lexical_unit'


def write_frame_arguments(parent_elem, entry, frame):
    """Append the 'arguments' feature with all complements of the frame."""
    arguments_f_elem = etree.SubElement(parent_elem, 'f')
    arguments_f_elem.attrib['name'] = 'arguments'
    vColl_elem = etree.SubElement(arguments_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    for arg in frame.complements.all():
        write_frame_argument(vColl_elem, entry, frame, arg)


def write_frame_argument(parent_elem, entry, frame, arg):
    """Append an 'argument' structure: roles and selective preferences."""
    arg_base_id = u'wal_%d.%d' % (entry.id, frame.id)
    arg_xml_id = arg_base_id + u'.%d-arg' % arg.id
    argument_fs_elem = etree.SubElement(parent_elem, 'fs')
    argument_fs_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = arg_xml_id
    argument_fs_elem.attrib['type'] = 'argument'
    write_roles(argument_fs_elem, arg)
    write_selective_preferences(argument_fs_elem, arg, arg_base_id)


def write_roles(parent_elem, arg):
    """Append the argument roles, ordered by 'gradient'.

    A role with a truthy gradient is written as 'role_attribute', any
    other role as 'role'.
    """
    for role in arg.roles.order_by('gradient'):
        if role.gradient:
            attribute_f_elem = etree.SubElement(parent_elem, 'f')
            attribute_f_elem.attrib['name'] = 'role_attribute'
            attribute_symbol_elem = etree.SubElement(attribute_f_elem, 'symbol')
            attribute_symbol_elem.attrib['value'] = unicode(role)
        else:
            role_f_elem = etree.SubElement(parent_elem, 'f')
            role_f_elem.attrib['name'] = 'role'
            role_symbol_elem = etree.SubElement(role_f_elem, 'symbol')
            role_symbol_elem.attrib['value'] = unicode(role)


def write_selective_preferences(parent_elem, arg, arg_base_id):
    """Append the 'sel_prefs' feature if the argument has preferences.

    arg_base_id -- 'wal_<entry>.<frame>' prefix used to build links to
        other arguments of the same frame.
    """
    if arg.selective_preference:
        sel_prefs_f_elem = etree.SubElement(parent_elem, 'f')
        sel_prefs_f_elem.attrib['name'] = 'sel_prefs'
        sel_prefs_groups_fs_elem = etree.SubElement(sel_prefs_f_elem, 'fs')
        sel_prefs_groups_fs_elem.attrib['type'] = 'sel_prefs_groups'

        write_synsets_sel_prefs(sel_prefs_groups_fs_elem, arg)
        write_predefined_sel_prefs(sel_prefs_groups_fs_elem, arg)
        write_relation_sel_prefs(sel_prefs_groups_fs_elem, arg, arg_base_id)
        write_synset_relation_sel_prefs(sel_prefs_groups_fs_elem, arg)


def write_synsets_sel_prefs(parent_elem, arg):
    """Append the 'synsets' feature, if the preference has any synsets."""
    synsets = arg.selective_preference.synsets
    if synsets.exists():
        synsets_f_elem = etree.SubElement(parent_elem, 'f')
        synsets_f_elem.attrib['name'] = 'synsets'
        vColl_elem = etree.SubElement(synsets_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'
        for synset in synsets.all():
            write_synset(vColl_elem, synset)


def write_synset(parent_elem, synset):
    """Append a <numeric> element whose value is the synset id."""
    id_numeric_elem = etree.SubElement(parent_elem, 'numeric')
    id_numeric_elem.attrib['value'] = str(synset.id)


def write_predefined_sel_prefs(parent_elem, arg):
    """Append the 'predefs' feature, if the preference has any generals."""
    generals = arg.selective_preference.generals
    if generals.exists():
        predefs_f_elem = etree.SubElement(parent_elem, 'f')
        predefs_f_elem.attrib['name'] = 'predefs'
        vColl_elem = etree.SubElement(predefs_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'
        for predef in generals.all():
            write_predef(vColl_elem, predef)


def write_predef(parent_elem, predef):
    """Append a <symbol> whose value is the predefined preference name."""
    name_symbol_elem = etree.SubElement(parent_elem, 'symbol')
    name_symbol_elem.attrib['value'] = predef.name


def write_relation_sel_prefs(parent_elem, arg, arg_base_id):
    """Append the 'relations' feature, if the preference has any relations."""
    relations = arg.selective_preference.relations
    if relations.exists():
        relations_f_elem = etree.SubElement(parent_elem, 'f')
        relations_f_elem.attrib['name'] = 'relations'
        vColl_elem = etree.SubElement(relations_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'
        for relation in relations.all():
            write_relation(vColl_elem, relation, arg_base_id)


def write_relation(parent_elem, relation, arg_base_id):
    """Append a 'relation' structure: relation type and target argument link."""
    relation_fs_elem = etree.SubElement(parent_elem, 'fs')
    relation_fs_elem.attrib['type'] = 'relation'

    relation_f_elem = etree.SubElement(relation_fs_elem, 'f')
    relation_f_elem.attrib['name'] = 'type'
    type_symbol_elem = etree.SubElement(relation_f_elem, 'symbol')
    type_symbol_elem.attrib['value'] = relation.relation.name

    to_f_elem = etree.SubElement(relation_fs_elem, 'f')
    to_f_elem.attrib['name'] = 'to'
    to_xml_link = '#%s.%d-arg' % (arg_base_id, relation.to.id)
    arg_link_elem = etree.SubElement(to_f_elem, 'fs')
    arg_link_elem.attrib['sameAs'] = to_xml_link
    arg_link_elem.attrib['type'] = 'argument'


def write_synset_relation_sel_prefs(parent_elem, arg):
    """Append the 'synset_relations' feature, if there are any."""
    relations = arg.selective_preference.synset_relations
    if relations.exists():
        relations_f_elem = etree.SubElement(parent_elem, 'f')
        relations_f_elem.attrib['name'] = 'synset_relations'
        vColl_elem = etree.SubElement(relations_f_elem, 'vColl')
        vColl_elem.attrib['org'] = 'set'
        for relation in relations.all():
            write_synset_relation(vColl_elem, relation)


def write_synset_relation(parent_elem, relation):
    """Append a 'synset_relation' structure: relation type and target synset."""
    relation_fs_elem = etree.SubElement(parent_elem, 'fs')
    relation_fs_elem.attrib['type'] = 'synset_relation'

    relation_f_elem = etree.SubElement(relation_fs_elem, 'f')
    relation_f_elem.attrib['name'] = 'type'
    type_symbol_elem = etree.SubElement(relation_f_elem, 'symbol')
    type_symbol_elem.attrib['value'] = relation.relation.name

    to_f_elem = etree.SubElement(relation_fs_elem, 'f')
    to_f_elem.attrib['name'] = 'to'
    write_synset(to_f_elem, relation.to)


def write_meanings_layer(parent_elem, lemma):
    """Append the 'meanings_layer' feature structure of an entry."""
    meanings_layer_elem = etree.SubElement(parent_elem, 'fs')
    meanings_layer_elem.attrib['type'] = 'meanings_layer'
    meanings_f_elem = etree.SubElement(meanings_layer_elem, 'f')
    meanings_f_elem.attrib['name'] = 'meanings'
    vColl_elem = etree.SubElement(meanings_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    write_meanings(vColl_elem, lemma)


def write_meanings(parent_elem, lemma):
    """Write every lexical unit listed in the entry's meanings."""
    entry = lemma.entry_obj
    for lex_unit in entry.meanings.all():
        write_meaning(parent_elem, entry, lex_unit)


def write_meaning(parent_elem, entry, lex_unit):
    """Append a 'lexical_unit' structure.

    Written features: name, variant, plwnluid and, when the unit has one,
    gloss.
    """
    meaning_xml_id = u'wal_%d.%d-mng' % (entry.id, lex_unit.id)
    meaning_fs_elem = etree.SubElement(parent_elem, 'fs')
    meaning_fs_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = meaning_xml_id
    meaning_fs_elem.attrib['type'] = 'lexical_unit'

    name_f_elem = etree.SubElement(meaning_fs_elem, 'f')
    name_f_elem.attrib['name'] = 'name'
    name_content_elem = etree.SubElement(name_f_elem, 'string')
    name_content_elem.text = lex_unit.base

    variant_f_elem = etree.SubElement(meaning_fs_elem, 'f')
    variant_f_elem.attrib['name'] = 'variant'
    variant_string_elem = etree.SubElement(variant_f_elem, 'string')
    variant_string_elem.text = lex_unit.sense

    plwnluid_f_elem = etree.SubElement(meaning_fs_elem, 'f')
    plwnluid_f_elem.attrib['name'] = 'plwnluid'
    plwnluid_numeric_elem = etree.SubElement(plwnluid_f_elem, 'numeric')
    plwnluid_numeric_elem.attrib['value'] = str(lex_unit.luid)

    if lex_unit.glossa:
        gloss_f_elem = etree.SubElement(meaning_fs_elem, 'f')
        gloss_f_elem.attrib['name'] = 'gloss'
        gloss_content_elem = etree.SubElement(gloss_f_elem, 'string')
        gloss_content_elem.text = lex_unit.glossa


def write_connections_layer(parent_elem, lemma):
    """Append the 'connections_layer' feature structure of an entry."""
    connections_layer_elem = etree.SubElement(parent_elem, 'fs')
    connections_layer_elem.attrib['type'] = 'connections_layer'
    alternations_f_elem = etree.SubElement(connections_layer_elem, 'f')
    alternations_f_elem.attrib['name'] = 'alternations'
    vColl_elem = etree.SubElement(alternations_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    write_alternations(vColl_elem, lemma)


def write_alternations(parent_elem, lemma):
    """Write alternations 1 and 2 for every (schema, frame) pair of the lemma."""
    entry = lemma.entry_obj
    frames = entry.actual_frames()
    for schema in lemma.frames.all():
        for frame in frames:
            matching_complements = frame.complements.filter(realizations__frame=schema).distinct()
            write_alternation(parent_elem, entry, schema, frame, matching_complements, 1)
            write_alternation(parent_elem, entry, schema, frame, matching_complements, 2)


def write_alternation(parent_elem, entry, schema, frame, complements, alternation):
    """Append an 'alternation' structure for one schema/frame pair.

    complements -- complements of frame realized in schema.
    alternation -- alternation number to export.

    The structure is created lazily, on the first complement that has a
    realization in this schema with this alternation number, so nothing is
    written when there is no such realization.
    """
    alternation_compls = complements.filter(realizations__alternation=alternation)
    if alternation_compls.exists():
        first_connection = True
        for arg in alternation_compls.all():
            alt_realizations = arg.realizations.filter(frame=schema, alternation=alternation)
            if alt_realizations.exists():
                if first_connection:
                    alternation_fs_elem = etree.SubElement(parent_elem, 'fs')
                    alternation_fs_elem.attrib['type'] = 'alternation'
                    connections_f_elem = etree.SubElement(alternation_fs_elem, 'f')
                    connections_f_elem.attrib['name'] = 'connections'
                    vColl_elem = etree.SubElement(connections_f_elem, 'vColl')
                    vColl_elem.attrib['org'] = 'set'
                    first_connection = False
                write_connection(vColl_elem, entry, frame, arg, alt_realizations)


def write_connection(parent_elem, entry, frame, arg, realizations):
    """Append a 'connection' structure: argument link and phrase links."""
    connection_fs_elem = etree.SubElement(parent_elem, 'fs')
    connection_fs_elem.attrib['type'] = 'connection'
    write_argument(connection_fs_elem, entry, frame, arg)
    write_phrases(connection_fs_elem, entry, realizations)


def write_argument(parent_elem, entry, frame, arg):
    """Append the 'argument' feature linking to '#wal_<entry>.<frame>.<arg>-arg'."""
    arg_f_elem = etree.SubElement(parent_elem, 'f')
    arg_f_elem.attrib['name'] = 'argument'
    arg_link = u'#wal_%d.%d.%d-arg' % (entry.id, frame.id, arg.id)
    arg_link_fs_elem = etree.SubElement(arg_f_elem, 'fs')
    arg_link_fs_elem.attrib['sameAs'] = arg_link
    arg_link_fs_elem.attrib['type'] = 'argument'


def write_phrases(parent_elem, entry, realizations):
    """Append the 'phrases' feature with one phrase link per realization."""
    phrases_f_elem = etree.SubElement(parent_elem, 'f')
    phrases_f_elem.attrib['name'] = 'phrases'
    vColl_elem = etree.SubElement(phrases_f_elem, 'vColl')
    vColl_elem.attrib['org'] = 'set'
    for realization in realizations:
        phrase_xml_link = u'#wal_%d.%d.%d.%d-phr' % (entry.id,
                                                     realization.frame.id,
                                                     realization.position.id,
                                                     realization.argument.id)
        phrase_link_elem = etree.SubElement(vColl_elem, 'fs')
        phrase_link_elem.attrib['sameAs'] = phrase_xml_link
        phrase_link_elem.attrib['type'] = 'phrase'


############# phrase types expansions

def write_phrase_types_expansions_in_TEI(outpath):
    """Write the phrase type expansions as a separate TEI file at outpath.

    The document has the "part of Walenty" header variant and one entry
    per phrase type that has at least one realization.
    """
    root = write_root()
    write_header(root, True)
    write_expansions_entries(root)
    # etree.tostring() called with an explicit encoding returns encoded
    # bytes, so the file is opened in binary mode.
    with open(outpath, 'wb') as output_file:
        output_file.write(etree.tostring(root, pretty_print=True,
                                         xml_declaration=True, encoding='UTF-8',
                                         doctype=u'<!DOCTYPE TEI SYSTEM "tei_all.dtd">'))


def write_expansions_entries(root):
    """Append <text>/<body> with one <entry> per phrase type with expansions.

    Phrase types are those Argument objects that have at least one
    realization; they are ordered by 'text_rep'.
    """
    phrase_types = Argument.objects.annotate(extensions_count=Count('realizations'))
    phrase_types_with_expansions = phrase_types.filter(extensions_count__gt=0)

    text = etree.SubElement(root, 'text')
    body = etree.SubElement(text, 'body')
    for phrase_type in phrase_types_with_expansions.order_by('text_rep'):
        expansions_xml_id = 'wal_%d-exp' % phrase_type.id
        expansions_elem = etree.SubElement(body, 'entry')
        expansions_elem.attrib[etree.QName(XML_NAMESPACE, 'id')] = expansions_xml_id
        write_main_phrase_type(expansions_elem, phrase_type)


def write_main_phrase_type(parent_elem, phrase_type):
    """Write the expanded phrase type (no xml:id, no link) and its expansions."""
    write_phrase(parent_elem, phrase_type, '', False)
    write_expansions(parent_elem, phrase_type)


def write_expansions(parent_elem, phrase_type):
    """Append the 'phrase_type_expansions' structure of a phrase type.

    Expansions are ordered by opinion priority, type priority and the
    text representation of their argument. Each holds its opinion and,
    depending on its type, either a set of positions or a one-phrase set.
    """
    expansions_fs_elem = etree.SubElement(parent_elem, 'fs')
    expansions_fs_elem.attrib['type'] = 'phrase_type_expansions'
    expansions_f_elem = etree.SubElement(expansions_fs_elem, 'f')
    expansions_f_elem.attrib['name'] = 'expansions'

    for expansion in phrase_type.realizations.order_by('opinion__priority',
                                                       'type__priority',
                                                       'argument__text_rep'):
        expansion_fs_elem = etree.SubElement(expansions_f_elem, 'fs')
        expansion_fs_elem.attrib['type'] = 'expansion'

        opinion_f_elem = etree.SubElement(expansion_fs_elem, 'f')
        opinion_f_elem.attrib['name'] = 'opinion'
        opinion_symbol = etree.SubElement(opinion_f_elem, 'symbol')
        opinion_symbol.attrib['value'] = unicode(expansion.opinion)

        if expansion.type.sym_name == 'positions':
            positions_f_elem = etree.SubElement(expansion_fs_elem, 'f')
            positions_f_elem.attrib['name'] = 'positions'
            vColl_elem = etree.SubElement(positions_f_elem, 'vColl')
            vColl_elem.attrib['org'] = 'set'
            for position in sort_positions(expansion.positions.all()):
                write_position_elem(vColl_elem, '', position)
        elif expansion.type.sym_name == 'phrase_type':
            phrases_f_elem = etree.SubElement(expansion_fs_elem, 'f')
            phrases_f_elem.attrib['name'] = 'phrases'
            vColl_elem = etree.SubElement(phrases_f_elem, 'vColl')
            vColl_elem.attrib['org'] = 'set'  # left in place for the future
            write_phrase(vColl_elem, expansion.argument, '')