# -*- coding: utf-8 -*-
from __future__ import print_function
from lxml import etree
import xml_utils
import os, sys
from nltk import tokenize
import srx_segmenter
import codecs

class LogicError(Exception):
    pass

IGNORE_LACK_OF_IDS = True

# temporary: path of the file currently being processed (set in parse_file)
structure_path = ""

# tags on which we open new paragraphs:
#   any tag within front and back except:
#     pb
#     figure
#     note
#     fw
#
#   in body:
#     head
#     p
#

#   free-floating textual content of any kind (outside the scope of the above):
#       figure
#       fw
#       note
#       (plain text)
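# Concatenate element.text, the recursive string value of every child, and
# element.tail. Note that the element's own tail is included, which the
# offset bookkeeping in obtain_inner_text relies on.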
def get_string_value(element):
    text = element.text or ''
    for child in element:
        text += get_string_value(child)
    text += element.tail or ''
    return text

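# Helpers that qualify a local tag name with the TEI / XML namespace from
# xml_utils.namespaces, using lxml's "{uri}localname" notation; e.g.
# teins("p") presumably yields "{http://www.tei-c.org/ns/1.0}p".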
def teins(x):
    return "{%s}%s" % (xml_utils.namespaces['tei'], x)

def xmlns(x):
    return "{%s}%s" % (xml_utils.namespaces['xml'], x)

def extract_tag(x):
    return x[x.find("}")+1:]

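# Throughout this module paragraphs are threaded through two mutable cells:
#   paras   -- a pair of lists: paras[0] holds finished paragraphs,
#              paras[1] holds paragraphs deferred until the current one
#              closes (notes and running headers extracted from inside it);
#   cur_par -- a one-element list whose single slot is either None (no open
#              paragraph) or the list of fragments of the paragraph being built.
# end_par closes the current paragraph and flushes the deferred ones after it.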
def end_par(paras, cur_par):
    if cur_par[0] and len(cur_par[0]) > 0:
        paras[0].append(cur_par[0])
        cur_par[0] = None
        if len(paras[1]) > 0:
            paras[0].extend(paras[1])
            del paras[1][:]
    if len(paras[1]) > 0:
        raise LogicError("odwleczone akapity istnieja pomimo, ze nie ma aktualnego akapitu")

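# Return the element's xml:id, or None if absent; unless IGNORE_LACK_OF_IDS
# is set, a missing id on a non-optional element raises a LogicError.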
def id_or_none(elem, optional = False):
    if not IGNORE_LACK_OF_IDS and not optional and xmlns("id") not in elem.attrib:
        raise LogicError("brak xml:id w " + etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
    return elem.attrib[xmlns("id")] if xmlns("id") in elem.attrib else None

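# Walk the mixed content of elem: text nodes become {'id', 'text',
# 'xml_offset'} fragments appended to the current paragraph, child elements
# are dispatched to process_elt, and character offsets within elem are
# tracked so each fragment knows where it starts in the source XML.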
def obtain_inner_text(elem, parentid, paras, cur_par, in_par, text_expected = True):
    offset = 0
    if elem.text:
        if cur_par[0] is None:
            if text_expected:
                cur_par[0] = []
            elif len(elem.text.strip()) > 0:
                raise LogicError("niespodziewany tekst w " + etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
        if cur_par[0] is not None:
            cur_par[0].append({'id': parentid, 'text' : elem.text, 'xml_offset' : 0})
            offset += len(elem.text)
    for child in elem.iterchildren():
        process_elt(child, paras, cur_par, in_par)

        offset += len(get_string_value(child))

        if child.tail:
            if cur_par[0] is None:
                if text_expected:
                    cur_par[0] = []
                elif len(child.tail.strip()) > 0:
                    raise LogicError("niespodziewany tekst w " + etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
            if cur_par[0] is not None:
                cur_par[0].append({'id': parentid, 'text' : child.tail, 'xml_offset' : offset-len(child.tail)})
                offset += len(child.tail)

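# Dispatch on the TEI tag: container tags (front, body, div, ...) close the
# current paragraph and recurse into their children; p, head, docTitle etc.
# open a fresh paragraph; choice, foreign, fw, note, figure, g, gap and sic
# each get special fragment handling. Tags not listed here are silently skipped.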
def process_elt(elem, paras, cur_par, in_par = False):
    #print("PE:", structure_path)
    #print(etree.tostring(elem))

    if elem.tag in [teins(x) for x in ("front", "body", "titlePage", "back", "div")]:
        end_par(paras, cur_par)
        for child in elem.iterchildren():
            process_elt(child, paras, cur_par, in_par)
        
    elif elem.tag in [teins(x) for x in ("p", "head", "docTitle", "docImprint", "docAuthor")]:
        new_in_par = in_par
        if not in_par:
            end_par(paras, cur_par)
            cur_par[0] = []
            if elem.tag == teins("p"):
                new_in_par = True
        #elif elem.tag == teins("p"):
            #print("plik:", structure_path)
            #print(etree.tostring(elem, encoding = "utf-8").decode("utf-8"))

        obtain_inner_text(elem, id_or_none(elem), paras, cur_par, new_in_par)
        if not in_par:
            end_par(paras, cur_par)

    elif elem.tag in [teins(x) for x in ("titlePart", "docDate")]:
        obtain_inner_text(elem, id_or_none(elem), paras, cur_par, in_par)

    elif elem.tag == teins('pb'):
        pass

    elif elem.tag == teins('foreign'):
        if elem.text is None and len(list(elem.iterchildren())) == 0:
            if xmlns("lang") in elem.attrib and elem.attrib[xmlns("lang")] == "x-nlws":
                if cur_par[0] is not None:
                    cur_par[0].append({"text" : None, "id" : id_or_none(elem, True), "type" : "foreign/nonlatin"})
        else:
            if cur_par[0] is None:
                raise LogicError("invalid foreign "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
            # capture the foreign content off to the side, then set the type flag on everything collected
            aparas = ([],[])
            acur_par = [[]]
            obtain_inner_text(elem, id_or_none(elem), aparas, acur_par, in_par)
            if len(aparas[0]) > 0:
                raise LogicError("problem z foreign: zagniezdzony akapit? "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
            for x in acur_par[0]:
                if "type" not in x:
                    x["type"] = "foreign" 
                cur_par[0].append(x)
            for adp in aparas[1]:
                dp = []
                for x in adp:
                    if "type" not in x:
                        x["type"] = "foreign" 
                    dp.append(x)
                paras[1].append(dp)


    elif elem.tag == teins('choice'):
        choices = list(elem.iterchildren())
        if (elem.text and len(elem.text.strip()) > 0) or cur_par[0] is None or len(choices) != 2:
            raise LogicError("invalid choice "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))

        entry = {"text" : None, "orig_text" : None}
        cur_par[0].append(entry)
        for c in choices:
            if c.tag == teins("orig"):
                if entry["orig_text"]:
                    raise LogicError("invalid choice "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
                entry["orig_text"] = c.text
                entry["orig_id"] = id_or_none(c)
            elif c.tag == teins("reg"):
                if entry["text"]:
                    raise LogicError("invalid choice "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
                entry["text"] = c.text
                entry["xml_offset"] = 0
                entry["id"] = id_or_none(c)
            else:
                raise LogicError("invalid choice "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
            if (c.tail and len(c.tail.strip()) > 0) or len(list(c.iterchildren())) > 0:
                raise LogicError("invalid choice "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))

    elif elem.tag == teins("fw"):
        aparas = ([], [])
        acur_par = [[]]
        obtain_inner_text(elem, id_or_none(elem), aparas, acur_par, in_par)
        if len(aparas[0]) > 0 or len(aparas[1]) > 0:
            raise LogicError("bledny fw "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
        # zywe paginy wyrzucam poza (za) aktualny akapit (jesli jestesmy w akapicie)
        if cur_par[0] is not None:
            paras[1].append(acur_par[0])
        else:
            paras[0].append(acur_par[0])

    elif elem.tag == teins("note"):
        aparas = ([], [])
        acur_par = [[]]
        obtain_inner_text(elem, id_or_none(elem, True), aparas, acur_par, in_par)
        if xmlns("id") not in elem.attrib and not IGNORE_LACK_OF_IDS:
            if len(acur_par[0]) > 0:
                raise LogicError("bledne note " +etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
        else:
            if len(aparas[0]) > 0 or len(aparas[1]) > 0:
                raise LogicError("bledne note "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
            # notki wyrzucam poza (za) aktualny akapit (jesli jestesmy w akapicie)
            if cur_par[0] is not None:
                paras[1].append(acur_par[0])
            else:
                paras[0].append(acur_par[0])

    elif elem.tag in [teins(x) for x in ["figure", "formula", "notatedMusic", "span"]]:
        if cur_par[0] is None:
            cur_par[0] = []
        cur_par[0].append({"text" : None, "id" : id_or_none(elem, True), "type" : extract_tag(elem.tag)})

    elif elem.tag == teins("g"):
        entry = {"text" : u"¤", "id" : None, "type" : extract_tag(elem.tag), "xml_offset" : 0}
        if cur_par[0] is not None:
            cur_par[0].append(entry)
        else:
            paras[0].append([entry])

    elif elem.tag == teins("gap"):
        if cur_par[0] is None:
            cur_par[0] = []
        cur_par[0].append({"text" : None, "id" : None, "type" : extract_tag(elem.tag)})


    elif elem.tag == teins('sic'):
        if cur_par[0] is None:
            raise LogicError("bledne sic "+etree.tostring(elem, encoding = "utf-8").decode("utf-8"))
        obtain_inner_text(elem, id_or_none(elem), paras, cur_par, in_par)


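# Segment plaintext into sentences with the SRX rules for Polish from the
# segment.srx file next to this script; the nltk tokenizer is kept below
# as a commented-out alternative.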
def do_segmentation(plaintext):
    srx_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'segment.srx')
    rules = srx_segmenter.parse(srx_filepath)

    segmenter = srx_segmenter.SrxSegmenter(rules["Polish"], plaintext)
    sents, whitespaces = segmenter.extract()

    #sents = tokenize.sent_tokenize(plaintext)
    return sents

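# Rebuild the paragraph's plain text from its fragments, segment it into
# sentences, then map every sentence back onto the fragments it covers,
# slicing fragments that straddle a sentence boundary. Returns the
# fragments grouped by sentence plus the raw sentence list.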
def make_sents(parafrags):
    plaintext = u""
    for f in parafrags:
        start = len(plaintext)
        if "text" in f and f["text"]:
            plaintext += f["text"]
        f["start"] = start
        f["end"] = len(plaintext)
        #print(repr(f))

    #print
    #print

    sent_list = do_segmentation(plaintext)
    plainpos = 0
    sent_frags = []
    last_frag = 0
    for sent in sent_list:
        sentpos = 0
        if len(sent.strip()) > 0:
            plainsent = sent.lstrip()
            while plaintext[plainpos] != plainsent[sentpos] and len(plaintext[plainpos].strip()) == 0:
                plainpos += 1
                if plainpos >= len(plaintext):
                    raise LogicError("text ended prematurely, "+str(plainpos)+", "+plaintext+", "+str(sentpos)+", "+plainsent)

            start = plainpos

            while sentpos < len(plainsent):
                if plainpos >= len(plaintext):
                    raise LogicError("text ended prematurely, "+str(plainpos)+", "+plaintext+", "+str(sentpos)+", "+plainsent)
                if plaintext[plainpos] != plainsent[sentpos]:
                    raise LogicError("text mismatch, "+str(plainpos)+", "+plaintext+", "+str(sentpos)+", "+plainsent)
                sentpos += 1
                plainpos += 1

            end = plainpos

            frags = []
            while last_frag < len(parafrags) and parafrags[last_frag]["start"] <= end:
                frg = parafrags[last_frag]
                if frg["start"] >= start and frg["end"] <= end:
                    last_frag += 1
                    if "text" in frg:
                        frg["offset"] = 0
                    frags.append(frg)
                elif frg["end"] <= end:
                    last_frag += 1
                    nfrg = {"text" : None}
                    if frg["text"]:
                        offset = start-frg["start"]
                        nfrg["text"] = frg["text"][offset:]
                        nfrg["offset"] = offset 
                        if "orig_text" in frg:
                            nfrg["orig_text"] = frg["orig_text"][offset:l]

                    nfrg["id"] = frg["id"]
                    if "orig_id" in frg:
                        nfrg["orig_id"] = frg["orig_id"]
                    if "type" in frg:
                        nfrg["type"] = frg["type"]
                    if "xml_offset" in frg:
                        nfrg["xml_offset"] = frg["xml_offset"]

                    frags.append(nfrg)
                else:
                    nfrg = {"text" : None}
                    if frg["text"]:
                        offset = 0 if frg["start"] >= start else start-frg["start"]
                        l = end-frg["start"]
                        nfrg["text"] = frg["text"][offset:l]
                        nfrg["offset"] = offset 
                        if "orig_text" in frg:
                            nfrg["orig_text"] = frg["orig_text"][offset:l]

                    nfrg["id"] = frg["id"]
                    if "orig_id" in frg:
                        nfrg["orig_id"] = frg["orig_id"]
                    if "type" in frg:
                        nfrg["type"] = frg["type"]
                    if "xml_offset" in frg:
                        nfrg["xml_offset"] = frg["xml_offset"]

                    frags.append(nfrg)
                    break
                if frags[-1]["text"] is not None and "offset" not in frags[-1]:
                    print(repr(frags[1]))
            #print(plainsent)
            sent_frags.append(frags)
            #print(repr(frags))
            #print
            while last_frag < len(parafrags) and parafrags[last_frag]["text"]:
                if len(parafrags[last_frag]["text"][end-parafrags[last_frag]["start"]:].strip()) == 0:
                    last_frag += 1
                else:
                    break

    if len(sent_frags) == 0 and len(parafrags) > 0:
        sent_frags.append([])
    if len(parafrags[last_frag:]) > 0:
        sent_frags[-1].extend(parafrags[last_frag:])

    filtered_frags = []
    for s in sent_frags:
        nf = []
        for f in s:
            if f["text"] is None or len(f["text"].strip()) > 0:
                nf.append(f)
                #print repr(f)
                if f["text"] is not None and "offset" not in f:
                    print(repr(f))
                    raise LogicError("??")
        if len(nf) > 0:
            filtered_frags.append(nf)

    return filtered_frags, sent_list

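# Parse <path>/text_structure.xml, collect paragraphs from its single
# tei:text element, run sentence segmentation on each, and dump the plain
# sentences to <path>/zdania.txt for manual inspection of the segmentation.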
def parse_file(path):
    global structure_path
    print(path)
    structure_path = os.path.join(unicode(path, "utf-8"), u'text_structure.xml')
    text_tree = etree.parse(structure_path)
    paras = ([], [])
    cur_par = [None]
    text = list(text_tree.xpath("tei:TEI/tei:text", namespaces=xml_utils.namespaces))
    if len(text) != 1:
        raise LogicError("bledne TEI\n\n" + str(text))
    for elem in text[0].iterchildren():
        process_elt(elem, paras, cur_par)
    end_par(paras, cur_par)

    parasents = []
    paralist = []
    for para in paras[0]:
        sents, plainsents = make_sents(para)
        parasents.append(sents)
        paralist.append(plainsents)


    # TODO: temporary dump of sentences as plain text, for inspecting sentence segmentation
    plaintext_path = os.path.join(unicode(path, "utf-8"), u'zdania.txt')
    fo = codecs.open(plaintext_path, "w", encoding="utf-8")
    for sent_list in paralist:
        for s in sent_list:
            print(s.strip()+u"\n", file = fo)
        print(u"\n\n\n", file = fo)
    fo.close()

    return parasents
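
# A minimal command-line entry point (a sketch, not part of the original
# module, which only defines parse_file and is presumably driven from elsewhere):
if __name__ == "__main__":
    for corpus_dir in sys.argv[1:]:
        parse_file(corpus_dir)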