# tei_writer.py 12.3 KB
# -*- coding: utf-8 -*-

import os, sys
import codecs
from xml_utils import xml_escape
import settings
from dag_tools import read_text

# File name of the segmentation layer written by write_files(); it is also
# used as the target document of the corresp references emitted into the
# morphosyntax layer.
SEGMENTATION = u"ann_segmentation.xml"
# File name of the morphosyntax layer written by write_files().
MORPHOSYNTAX = u"ann_morphosyntax.xml"
# File name of the tagger output that write_files() reads when it is asked to
# include tagging information.
TAGER = u"text_transcr.txt.tagged"

class LogicError(Exception):
    """Raised when the segment data being written is internally inconsistent.

    Used in this module for malformed choices, unexpected objects in place of
    segments, and disagreement between the text segments and the tagger output.
    """


def write_header(f):
    """Write the opening part of a TEI corpus document to ``f``.

    Emits, in a single ``f.write`` call, the XML declaration, the
    ``teiCorpus`` root with its namespace declarations, the two XInclude
    header references and the opening ``TEI``, ``text`` and ``body`` tags.
    """
    prologue = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<teiCorpus xmlns:xi="http://www.w3.org/2001/XInclude"'
        ' xmlns:nkjp="http://www.nkjp.pl/ns/1.0"'
        ' xmlns="http://www.tei-c.org/ns/1.0">\n'
        '  <xi:include href="KORBA_header.xml"/>\n'
        '  <TEI>\n'
        '    <xi:include href="header.xml"/>\n'
        '    <text>\n'
        '      <body>\n'
    )
    f.write(prologue)

def write_footer(f):
    """Write the closing tags matching ``write_header`` to ``f``.

    Closes ``body``, ``text``, ``TEI`` and ``teiCorpus`` in a single
    ``f.write`` call.
    """
    epilogue = (
        '      </body>\n'
        '    </text>\n'
        '  </TEI>\n'
        '</teiCorpus>\n'
    )
    f.write(epilogue)


def start_tag(f, tg, xmlid, corresp, indent=u""):
    """Write an opening tag ``<tg xml:id="..." [corresp="..."]>`` to ``f``.

    The tag is preceded by ``indent`` plus a fixed eight-space base indent and
    followed by a newline.  The ``corresp`` attribute is only emitted when
    ``corresp`` is not ``None``.
    """
    attributes = u" xml:id=\"{}\"".format(xmlid)
    if corresp is not None:
        attributes += u" corresp=\"{}\"".format(corresp)
    f.write(indent+u"        <{}{}>\n".format(tg, attributes))

def end_tag(f, tg, indent=u""):
    """Write the closing tag ``</tg>`` to ``f``.

    Uses the same indentation scheme as ``start_tag``: ``indent`` followed by
    a fixed eight-space base indent, and a trailing newline.
    """
    closing = u"        </{}>\n".format(tg)
    f.write(indent+closing)


def incscntr(seg, cntrptr, origchoice = False):
    """Count the plain segments contained in ``seg``.

    ``cntrptr`` is a one-element list used as a mutable counter:
    ``cntrptr[0]`` is incremented once for every dict found.  A dict counts as
    one segment; a list is treated as a sequence of alternatives, each of
    which is a sequence of segments that are counted recursively.

    When ``origchoice`` is true the list must consist of exactly two
    alternatives; otherwise the offending value is printed and ``LogicError``
    is raised.  ``LogicError`` is also raised for anything that is neither a
    dict nor a list.
    """
    if isinstance(seg, dict):
        cntrptr[0] += 1
        return

    if not isinstance(seg, list):
        raise LogicError("niespodziewany smiec zamiast segmentu "+repr(seg))

    if origchoice and len(seg) != 2:
        print(repr(seg))
        raise LogicError("niepoprawny choice")

    # Nested segments are always counted as ordinary (non-orig) choices.
    for alternative in seg:
        for inner in alternative:
            incscntr(inner, cntrptr, False)



def write_seg(fs, fm, seg, cntrptr, pcnt, xmlid, typ, tager, rejected, oxmlid = None, ind = u""):
    """Write one segment, or a choice between segmentations, to both layers.

    ``seg`` is either a dict describing a single segment or a list of
    alternatives, where every alternative is a list of segments (each of which
    may again be a dict or a nested list).

    * dict: a ``<seg>`` element is written to the segmentation file ``fs`` and
      a corresponding ``<seg>`` holding a ``morph`` feature structure is
      written to the morphosyntax file ``fm``.  The generated ids are stored
      in the dict under ``"segid"`` and ``"morphid"`` and ``cntrptr[0]`` is
      incremented.
    * list: a ``<choice>`` element is written to ``fs`` only, and every
      segment of every alternative is written recursively.  Alternatives with
      more than one segment are wrapped in ``<nkjp:paren>``.

    When ``tager`` is true, the module globals ``dag`` (tagger output, indexed
    by position) and ``dagcnt`` (index of the next tagger entry to consume)
    are used: an alternative of a choice is marked as rejected unless its
    first segment's ``orth`` equals the ``orth`` of the tagger entry that was
    current when the choice started, and every non-rejected segment gets a
    ``disamb`` feature and advances ``dagcnt`` by one.

    The module global ``source_filename`` is read when ``xmlid`` is given.

    Args:
        fs: writable file object for the segmentation layer.
        fm: writable file object for the morphosyntax layer.
        seg: segment dict or list of alternatives (see above).  A segment
            dict must provide ``"orth"``, ``"source_orth"`` and ``"interps"``
            (dicts with ``"base"``, ``"ctag"`` and ``"msd"``), plus
            ``"source_offset"`` when ``xmlid`` is not ``None``; an optional
            ``"nps"`` key marks the segment as not preceded by a space.
        cntrptr: one-element list holding the number of the next segment.
        pcnt: paragraph number used in the generated ids.
        xmlid: id of the source element referenced by ``corresp``, or
            ``None`` to omit the reference.
        typ: value of the optional ``type`` attribute (omitted when falsy).
        tager: true when tagger output has to be taken into account.
        rejected: whether a single segment belongs to an alternative that the
            tagger did not choose; recomputed per alternative for lists.
        oxmlid: for a choice of exactly two alternatives, the source id used
            for the second one (the first one uses ``xmlid``).
        ind: additional indentation prepended to every written line.

    Raises:
        LogicError: if a segment's ``orth`` differs from the current tagger
            entry, if several alternatives start with the same ``orth``, if a
            choice with ``oxmlid`` does not have exactly two alternatives, or
            if ``seg`` is neither a dict nor a list.
    """
    global dag
    global dagcnt
    indent = ind+u"            "
    if isinstance(seg, dict):
        seg["segid"] = "segm_{}.{}-seg".format(pcnt, cntrptr[0])
        seg["morphid"] = "morph_{}.{}-seg".format(pcnt, cntrptr[0])
        # True when the tagger's decision has to be recorded for this segment.
        tagged = tager and not rejected

        # Segment entry in the segmentation layer.
        if xmlid is not None:
            lines = indent+u"<seg xml:id=\"{}\" corresp=\"{}#string-range({},{},{})\"".format(seg["segid"], source_filename, xmlid, seg["source_offset"], len(seg["source_orth"]))
        else:
            lines = indent+u"<seg xml:id=\"{}\"".format(seg["segid"])

        if typ:
            lines += u" type=\"{}\"".format(typ)
        if "nps" in seg:
            lines += u" nkjp:nps=\"true\""
        if tager and rejected:
            lines += u" nkjp:rejected=\"true\""
        lines += ">\n"
        lines += indent+u"  <w>{}</w>\n".format(xml_escape(seg["source_orth"]))
        lines += indent+u"</seg>\n"

        fs.write(lines)

        # Segment entry in the morphosyntax layer.
        lines = indent+u"<seg xml:id=\"{}\" corresp=\"{}#{}\">\n".format(seg["morphid"], SEGMENTATION, seg["segid"])
        lines += indent+u"  <fs type=\"morph\">\n"
        lines += indent+u"    <f name=\"orth\">\n"
        lines += indent+u"      <string>{}</string>\n".format(xml_escape(seg["orth"]))
        lines += indent+u"    </f>\n"
        lines += indent+u"    <f name=\"translit\">\n"
        lines += indent+u"      <string>{}</string>\n".format(xml_escape(seg["source_orth"]))
        lines += indent+u"    </f>\n"
        if "nps" in seg:
            # The missing-space information is repeated in the morphosyntax layer.
            lines += indent+u"    <f name=\"nps\">\n"
            lines += indent+u"      <binary value=\"true\"/>\n"
            lines += indent+u"    </f>\n"
        lines += indent+u"    <f name=\"interps\">\n"

        # Group the msd values by (base, ctag): one <fs type="lex"> per pair.
        bctgmsd = {}
        for i in seg["interps"]:
            bctgmsd.setdefault((i["base"], i["ctag"]), []).append(i["msd"])

        # Reference to the msd chosen by the tagger; stays empty if none matches.
        tagid = ''
        for lid, (k, v) in enumerate(bctgmsd.items(), 1):
            lines += indent+u"      <fs type=\"lex\" xml:id=\"morph_{}.{}.{}-lex\">\n".format(pcnt, cntrptr[0], lid)
            lines += indent+u"        <f name=\"base\">\n"
            lines += indent+u"          <string>{}</string>\n".format(xml_escape(k[0]))
            lines += indent+u"        </f>\n"
            lines += indent+u"        <f name=\"ctag\">\n"
            lines += indent+u"          <symbol value=\"{}\"/>\n".format(k[1])
            lines += indent+u"        </f>\n"
            lines += indent+u"        <f name=\"msd\">\n"

            # Several msd values are wrapped in <vAlt> and indented one level deeper.
            several = len(v) > 1
            pad = u"  " if several else u""
            if several:
                lines += indent+u"          <vAlt>\n"
            for msdid, m in enumerate(v, 1):
                lines += indent+pad+u"          <symbol value=\"{}\" xml:id=\"morph_{}.{}.{}.{}-msd\"/>\n".format(m, pcnt, cntrptr[0], lid, msdid)
                if tagged:
                    if seg['orth'] != dag[dagcnt]['orth']:
                        raise LogicError(u"niezgodnosc segmentow w tekscie: {} i tagerze: {}".format(seg['orth'], dag[dagcnt]['orth']).encode('utf-8'))
                    seg_interp = u"{}:{}:{}".format(k[0], k[1], m) if m else u"{}:{}".format(k[0], k[1])
                    if dag[dagcnt]['interpretation'] == seg_interp:
                        tagid = u"#morph_{}.{}.{}.{}-msd".format(pcnt, cntrptr[0], lid, msdid)
            if several:
                lines += indent+u"          </vAlt>\n"

            lines += indent+u"        </f>\n"
            lines += indent+u"      </fs>\n"

        lines += indent+u"    </f>\n"

        if tagged:
            lines += indent+u"    <f name=\"disamb\">\n"
            lines += indent+u"      <fs type=\"tool_report\">\n"
            lines += indent+u"        <f name=\"choice\" fVal=\"{}\"/>\n".format(tagid)
            lines += indent+u"        <f name=\"interpretation\">\n"
            # The interpretation embeds the lemma, so it needs the same
            # escaping as the "base" feature to keep the XML well-formed.
            lines += indent+u"          <string>{}</string>\n".format(xml_escape(dag[dagcnt]['interpretation']))
            lines += indent+u"        </f>\n"
            lines += indent+u"      </fs>\n"
            lines += indent+u"    </f>\n"
            # This tagger entry is consumed; move on to the next one.
            dagcnt += 1

        lines += indent+u"  </fs>\n"
        lines += indent+u"</seg>\n"

        fm.write(lines)

        cntrptr[0] += 1
    elif isinstance(seg, list):
        if tager:
            # Alternatives are told apart by their first segment, so they must
            # not start with the same orth.
            word_startsegs = [choice[0]['orth'] for choice in seg]
            if len(set(word_startsegs)) != len(word_startsegs):
                raise LogicError("kilka wariantow segm. zaczyna sie tak samo "+repr(seg))
        fs.write(indent+u"<choice>\n")
        if oxmlid:
            if len(seg) != 2:
                print(repr(seg))
                raise LogicError("niepoprawny choice")
            xids = [xmlid, oxmlid]
        else:
            xids = [xmlid] * len(seg)

        # Remember the tagger entry that is current when the choice starts.
        # Writing an accepted alternative advances dagcnt, so comparing later
        # alternatives against dag[dagcnt] would test them against a token
        # that follows the choice (or run past the end of the tagger output).
        expected_orth = dag[dagcnt]['orth'] if (tager and seg) else None

        for c, cid in zip(seg, xids):
            if tager:
                rejected = c[0]['orth'] != expected_orth
            else:
                rejected = False

            xind = ind+u"  "
            if len(c) > 1:
                fs.write(indent+u"  <nkjp:paren>\n")
                xind += u"  "
            for s in c:
                write_seg(fs, fm, s, cntrptr, pcnt, cid, typ, tager, rejected, None, xind)
            if len(c) > 1:
                fs.write(indent+u"  </nkjp:paren>\n")
        fs.write(indent+u"</choice>\n")
    else:
        raise LogicError("niespodziewany smiec zamiast segmentu "+repr(seg))

def write_special_seg(f, frag, tager):
    """Write a special, content-less segment to the segmentation file ``f``.

    The element is a self-closing ``<seg>`` carrying ``frag["segid"]`` and
    ``frag["type"]``; nothing is written to the morphosyntax layer.  The
    ``tager`` argument is accepted but not used.
    """
    # A special segment has no text, so it does not occur in the tagger
    # output and the tagger position is intentionally left untouched.
    f.write(u"            <seg xml:id=\"{}\" type=\"{}\"/>\n".format(frag["segid"], frag["type"]))

def write_files(path, parasents, source_fn, tager):
    """Write the segmentation and morphosyntax TEI files for one text.

    Both files are created in ``settings.SUBFOL`` below ``path`` under the
    names ``SEGMENTATION`` and ``MORPHOSYNTAX``.  They are opened in a
    ``with`` statement, so they are closed even when writing fails part-way.

    Sets the module global ``source_filename`` and, when ``tager`` is true,
    the globals ``dag`` (tagger output read from the ``TAGER`` file in the
    same folder) and ``dagcnt`` (reset to 0); these are used by ``write_seg``.

    Args:
        path: base directory as a byte string; it is decoded from UTF-8.
        parasents: sequence of paragraphs; each paragraph is a sequence of
            sentences and each sentence a sequence of fragment dicts.  A
            fragment may carry ``"segs"`` and/or ``"orig_segs"`` (with
            ``"id"`` / ``"orig_id"`` and an optional ``"type"``); a fragment
            with neither is written as a special segment and must provide
            ``"type"``.
        source_fn: name of the source file used in ``corresp`` references.
        tager: true when tagger output should be merged into the result.
    """
    global source_filename
    global dag
    global dagcnt
    source_filename = source_fn

    out_dir = os.path.join(unicode(path, "utf-8"), settings.SUBFOL)
    segmentation_path = os.path.join(out_dir, SEGMENTATION)
    morphosyntax_path = os.path.join(out_dir, MORPHOSYNTAX)

    if tager:
        dag = read_text(os.path.join(out_dir, TAGER))
        dagcnt = 0

    with codecs.open(segmentation_path, mode='w', encoding='utf-8') as segm, \
            codecs.open(morphosyntax_path, mode='w', encoding='utf-8') as morpho:
        write_header(segm)
        write_header(morpho)

        for pcnt, p in enumerate(parasents, 1):
            # Segment numbering restarts at 1 in every paragraph.
            segcntr = [1]
            start_tag(segm, "p", "segm_{}-p".format(pcnt), None)
            start_tag(morpho, "p", "morph_{}-p".format(pcnt), SEGMENTATION+"#segm_{}-p".format(pcnt))

            for s in p:
                # Compute the sentence number the way NKJP does: it is the
                # number of the last segment of the sentence, so the segments
                # are counted before anything is written.
                scnt = [segcntr[0]-1]
                for frag in s:
                    if "orig_segs" in frag:
                        if "segs" in frag:
                            incscntr([frag["segs"], frag["orig_segs"]], scnt, True)
                        else:
                            for seg in frag["orig_segs"]:
                                incscntr(seg, scnt)
                    elif "segs" in frag:
                        for seg in frag["segs"]:
                            incscntr(seg, scnt)
                    else:
                        scnt[0] += 1
                scnt = scnt[0]

                start_tag(segm, "s", "segm_{}.{}-s".format(pcnt, scnt), None, u"  ")
                start_tag(morpho, "s", "morph_{}.{}-s".format(pcnt, scnt), SEGMENTATION+"#segm_{}.{}-s".format(pcnt, scnt), u"  ")

                for frag in s:
                    typ = frag["type"] if "type" in frag else None
                    if "orig_segs" in frag:
                        if "segs" in frag:
                            # Both variants are present: write them as a
                            # two-way choice, each with its own source id.
                            write_seg(segm, morpho, [frag["segs"], frag["orig_segs"]], segcntr, pcnt, frag["id"], typ, tager, False, frag["orig_id"])
                        else:
                            for seg in frag["orig_segs"]:
                                write_seg(segm, morpho, seg, segcntr, pcnt, frag["orig_id"], typ, tager, False)
                    elif "segs" in frag:
                        for seg in frag["segs"]:
                            write_seg(segm, morpho, seg, segcntr, pcnt, frag["id"], typ, tager, False)
                    else:
                        # Special segment: it still takes a segment number.
                        frag["segid"] = "segm_{}.{}-seg".format(pcnt, segcntr[0])
                        segcntr[0] += 1
                        write_special_seg(segm, frag, tager)

                end_tag(segm, "s", u"  ")
                end_tag(morpho, "s", u"  ")

            end_tag(segm, "p")
            end_tag(morpho, "p")

        write_footer(segm)
        write_footer(morpho)