# tei_writer.py (7.18 KB)
# -*- coding: utf-8 -*-

import os, sys
import codecs

# File names of the two annotation layers produced by write_files().
# SEGMENTATION is also used as the target document of the corresp="...#id"
# references emitted into the morphosyntax layer.
SEGMENTATION = u"ann_segmentation.xml"
MORPHOSYNTAX = u"ann_morphosyntax.xml"


class LogicError(Exception):
    """Raised when the writer is handed a malformed choice or an object
    that is neither a segment dict nor a list of alternatives."""


def write_header(f):
    """Write the XML declaration and the opening teiCorpus/TEI/text/body
    elements (including both header xi:include lines) to *f*.

    *f* is any object with a ``write`` method; the whole prologue is
    emitted with a single call.
    """
    prologue = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<teiCorpus xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xmlns="http://www.tei-c.org/ns/1.0">',
        '  <xi:include href="KORBA_header.xml"/>',
        '  <TEI>',
        '    <xi:include href="header.xml"/>',
        '    <text>',
        '      <body>',
    )
    f.write("\n".join(prologue) + "\n")

def write_footer(f):
    """Write the closing body/text/TEI/teiCorpus tags to *f*.

    Closes, in reverse order, the elements opened by the header; the text
    is emitted with a single ``write`` call.
    """
    closing = (
        '      </body>',
        '    </text>',
        '  </TEI>',
        '</teiCorpus>',
    )
    f.write("\n".join(closing) + "\n")


def start_tag(f, tg, xmlid, corresp, indent=u""):
    """Write an opening tag ``<tg xml:id="xmlid" ...>`` followed by a newline.

    f       -- object with a ``write`` method
    tg      -- element name
    xmlid   -- value of the xml:id attribute
    corresp -- value of the corresp attribute, or None to leave it out
    indent  -- extra indentation placed before the fixed eight-space prefix
    """
    attributes = u" xml:id=\"{}\"".format(xmlid)
    if corresp is not None:
        attributes += u" corresp=\"{}\"".format(corresp)
    f.write(u"{}        <{}{}>\n".format(indent, tg, attributes))

def end_tag(f, tg, indent=u""):
    """Write the closing tag ``</tg>`` and a newline to *f*, preceded by
    *indent* plus the fixed eight-space prefix."""
    closing = u"{}        </{}>\n".format(indent, tg)
    f.write(closing)


def _segmentation_xml(seg, xmlid, typ, indent):
    """Return the segmentation-layer <seg> element for one token dict."""
    if xmlid is not None:
        # Point at the token's character range inside element `xmlid` of the
        # source file (module-level `source_filename`, set by write_files).
        opening = u"<seg xml:id=\"{}\" corresp=\"{}#string-range({},{},{})\"".format(
            seg["segid"], source_filename, xmlid,
            seg["source_offset"], len(seg["source_orth"]))
    else:
        opening = u"<seg xml:id=\"{}\"".format(seg["segid"])

    if typ:
        opening += u" type=\"{}\"".format(typ)
    if "nps" in seg:
        opening += u" nkjp:nps=\"true\""

    rows = [
        opening + u">\n",
        u"  <w>{}</w>\n".format(seg["source_orth"]),
        u"</seg>\n",
    ]
    return u"".join(indent + row for row in rows)


def _morphosyntax_xml(seg, pcnt, scnt, segno, indent):
    """Return the morphosyntax-layer <seg> element for one token dict."""
    rows = [
        u"<seg xml:id=\"{}\" corresp=\"{}#{}\">\n".format(seg["morphid"], SEGMENTATION, seg["segid"]),
        u"  <fs type=\"morph\">\n",
        u"    <f name=\"orth\">\n",
        u"      <string>{}</string>\n".format(seg["orth"]),
        u"    </f>\n",
        u"    <f name=\"translit\">\n",
        u"      <string>{}</string>\n".format(seg["source_orth"]),
        u"    </f>\n",
        u"    <f name=\"interps\">\n",
    ]

    # Collect the msd values of all interpretations sharing (base, ctag);
    # each such pair becomes one <fs type="lex">.
    grouped = {}
    for interp in seg["interps"]:
        grouped.setdefault((interp["base"], interp["ctag"]), []).append(interp["msd"])

    # msd ids keep counting across lex entries; they are not reset per entry.
    msdid = 1
    for lid, ((base, ctag), msds) in enumerate(grouped.items(), 1):
        rows.append(u"      <fs type=\"lex\" xml:id=\"morph_{}.{}.{}.{}-lex\">\n".format(pcnt, scnt, segno, lid))
        rows.append(u"        <f name=\"base\">\n")
        rows.append(u"          <string>{}</string>\n".format(base))
        rows.append(u"        </f>\n")
        rows.append(u"        <f name=\"ctag\">\n")
        rows.append(u"          <symbol value=\"{}\"/>\n".format(ctag))
        rows.append(u"        </f>\n")
        rows.append(u"        <f name=\"msd\">\n")

        # Several msd values are wrapped in <vAlt>; a single one stands alone.
        alternatives = len(msds) > 1
        pad = u"            " if alternatives else u"          "
        if alternatives:
            rows.append(u"          <vAlt>\n")
        for msd in msds:
            rows.append(pad + u"<symbol value=\"{}\" xml:id=\"morph_{}.{}.{}.{}.{}-msd\"/>\n".format(
                msd, pcnt, scnt, segno, lid, msdid))
            msdid += 1
        if alternatives:
            rows.append(u"          </vAlt>\n")

        rows.append(u"        </f>\n")
        rows.append(u"      </fs>\n")

    rows.append(u"    </f>\n")
    rows.append(u"  </fs>\n")
    rows.append(u"</seg>\n")
    return u"".join(indent + row for row in rows)


def _write_choice(fs, fm, alternatives, cntrptr, pcnt, scnt, xmlid, typ, oxmlid, ind, indent):
    """Write a <choice> element whose children are the given alternatives."""
    fs.write(indent + u"<choice>\n")

    if oxmlid:
        if len(alternatives) != 2:
            print(repr(alternatives))
            raise LogicError("niepoprawny choice")
        # First alternative refers to `xmlid`, second to `oxmlid`.
        xmlids = [xmlid, oxmlid]
    else:
        xmlids = [xmlid] * len(alternatives)

    for alternative, alt_xmlid in zip(alternatives, xmlids):
        xind = ind + u"  "
        grouped = len(alternative) > 1
        if grouped:
            # An alternative with more than one segment is wrapped in
            # <nkjp:paren>, so its content is indented one level deeper.
            fs.write(indent + u"  <nkjp:paren>\n")
            xind += u"  "
        for member in alternative:
            write_seg(fs, fm, member, cntrptr, pcnt, scnt, alt_xmlid, typ, None, xind)
        if grouped:
            fs.write(indent + u"  </nkjp:paren>\n")

    fs.write(indent + u"</choice>\n")


def write_seg(fs, fm, seg, cntrptr, pcnt, scnt, xmlid, typ, oxmlid=None, ind=u""):
    """Write one segment, or a choice between alternative segmentations.

    fs      -- output of the segmentation layer (object with ``write``)
    fm      -- output of the morphosyntax layer (object with ``write``)
    seg     -- a dict describing one token, or a list of alternatives, each
               alternative being a sequence of such dicts / nested lists.
               A token dict is read for "source_orth", "orth", "interps"
               (items with "base", "ctag", "msd"), optionally "nps", and
               "source_offset" when `xmlid` is given; "segid" and "morphid"
               are stored into it.
    cntrptr -- one-element list holding the running segment number within
               the sentence; incremented once per token written
    pcnt, scnt -- paragraph and sentence numbers used to build the ids
    xmlid   -- id of the source element the token refers to, or None to
               omit the corresp attribute in the segmentation layer
    typ     -- value for the type attribute; omitted when falsy
    oxmlid  -- when truthy, `seg` must be a list of exactly two
               alternatives; the second one refers to `oxmlid` instead of
               `xmlid`
    ind     -- extra indentation prepended to every line

    Raises LogicError if `oxmlid` is given with a list whose length is not
    2, or if `seg` is neither a dict nor a list.

    NOTE(review): text values are inserted into the XML without escaping;
    this assumes callers pass XML-safe strings -- confirm.
    """
    indent = ind + u"            "

    if isinstance(seg, dict):
        seg["segid"] = "segm_{}.{}.{}-seg".format(pcnt, scnt, cntrptr[0])
        seg["morphid"] = "morph_{}.{}.{}-seg".format(pcnt, scnt, cntrptr[0])

        # write the segment to the segmentation layer
        fs.write(_segmentation_xml(seg, xmlid, typ, indent))

        # write the segment to the morphosyntax layer
        fm.write(_morphosyntax_xml(seg, pcnt, scnt, cntrptr[0], indent))

        cntrptr[0] += 1
    elif isinstance(seg, list):
        _write_choice(fs, fm, seg, cntrptr, pcnt, scnt, xmlid, typ, oxmlid, ind, indent)
    else:
        raise LogicError("niespodziewany smiec zamiast segmentu "+repr(seg))

def write_special_seg(f, frag):
    """Write an empty <seg/> element carrying the id and type of *frag*.

    A special segment is written only to the segmentation layer; *frag*
    must provide the "segid" and "type" keys.
    """
    template = u"            <seg xml:id=\"{}\" type=\"{}\"/>\n"
    f.write(template.format(frag["segid"], frag["type"]))

def write_files(path, parasents, source_fn):
    """Write the segmentation and morphosyntax files for one text.

    path      -- output directory; a UTF-8 encoded byte string or an
                 already-decoded text string
    parasents -- iterable of paragraphs, each an iterable of sentences,
                 each an iterable of fragment dicts.  A fragment with
                 "segs" has its segments written (using "id" and the
                 optional "type"); one that also has "orig_segs" (and
                 "orig_id") is additionally written as a choice; a fragment
                 without "segs" is written as a special segment and must
                 provide "type".
    source_fn -- name of the source file; stored in the module-level
                 `source_filename`, which write_seg uses when building
                 corresp references

    Both files are closed even when writing fails part-way through.
    """
    global source_filename
    source_filename = source_fn

    # Decode byte-string paths only: decoding an already-decoded path is an
    # error, so text paths are used unchanged.
    if not isinstance(path, type(u"")):
        path = path.decode("utf-8")
    segmentation_path = os.path.join(path, SEGMENTATION)
    morphosyntax_path = os.path.join(path, MORPHOSYNTAX)

    with codecs.open(segmentation_path, mode='w', encoding='utf-8') as segm, \
            codecs.open(morphosyntax_path, mode='w', encoding='utf-8') as morpho:
        write_header(segm)
        write_header(morpho)

        for pcnt, p in enumerate(parasents, 1):
            start_tag(segm, "p", "segm_{}-p".format(pcnt), None)
            start_tag(morpho, "p", "morph_{}-p".format(pcnt), SEGMENTATION+"#segm_{}-p".format(pcnt))

            for scnt, s in enumerate(p, 1):
                start_tag(segm, "s", "segm_{}.{}-s".format(pcnt, scnt), None, u"  ")
                start_tag(morpho, "s", "morph_{}.{}-s".format(pcnt, scnt), SEGMENTATION+"#segm_{}.{}-s".format(pcnt, scnt), u"  ")

                # Segment numbering restarts in every sentence; the one-element
                # list lets write_seg advance the counter in place.
                segcntr = [1]
                for frag in s:
                    typ = frag.get("type")
                    if "orig_segs" in frag:
                        if "segs" in frag:
                            write_seg(segm, morpho, [frag["segs"], frag["orig_segs"]], segcntr, pcnt, scnt, frag["id"], typ, frag["orig_id"])
                    # NOTE(review): this is a separate `if`, not `elif`, so a
                    # fragment with both "orig_segs" and "segs" gets its
                    # segments written inside the choice above and then once
                    # more here -- confirm this duplication is intended.
                    if "segs" in frag:
                        for seg in frag["segs"]:
                            write_seg(segm, morpho, seg, segcntr, pcnt, scnt, frag["id"], typ)
                    else:
                        frag["segid"] = "segm_{}.{}.{}-seg".format(pcnt, scnt, segcntr[0])
                        segcntr[0] += 1
                        write_special_seg(segm, frag)

                end_tag(segm, "s", u"  ")
                end_tag(morpho, "s", u"  ")

            end_tag(segm, "p")
            end_tag(morpho, "p")

        write_footer(segm)
        write_footer(morpho)