dataset_utils.py 12.1 KB

Edit Raw Blame History

from collections import Counter, defaultdict
from itertools import chain

from datasets import ClassLabel, Sequence

from morfeusz2 import Morfeusz

from .hybrid_tree_utils import tree_from_dataset_instance

from .constants import (
    FIRST,
    LAST,
    MASK_VALUE,
    EMPTY,
    UPPERCASE,
    LOWERCASE,
    SEG_BEGIN,
    SEG_INSIDE,
    TOKENS,
    SEGS,
    LEMMAS,
    LEMMA_CASES,
    LEMMA_RULES,
    TAGS,
    SPINES,
    ANCHORS,
    ANCHOR_HS,
    HEADS,
    ADJACENCY_MATRIX,
    NE_SPINES,
    NE_ANCHORS,
    NE_ANCHOR_HS,
    NE_HEADS,
    NE_ADJACENCY_MATRIX,
)

def make_lemma_rule(token, lemma, tag):
    """Encode how to rewrite ``token`` into ``lemma`` as a ``(case, rule)`` pair.

    Both strings are lower-cased before they are compared; the casing of the
    lemma's first character is reported separately through ``case``.

    Args:
        token: surface form of the word.
        lemma: its lemma.
        tag: tag string; only the substrings ``'neg'`` and ``'sup'`` are
            looked for.

    Returns:
        A tuple ``(case, rule)``:

        * ``case`` is ``UPPERCASE`` when the lemma starts with an uppercase
          character, ``LOWERCASE`` otherwise (including an empty lemma).
        * ``rule`` is the string ``'{prefix_cut}_{cut}_{suffix}'``:
          ``prefix_cut`` is the number of characters dropped from the start
          of the token (3 or 0), ``cut`` is the number of characters to drop
          from the end of the remaining token, and ``suffix`` is the text to
          append afterwards.
    """
    # Slicing instead of indexing: an empty lemma yields '' (not uppercase)
    # rather than raising IndexError.
    case = UPPERCASE if lemma[:1].isupper() else LOWERCASE
    token, lemma = token.lower(), lemma.lower()

    # A leading 'nie' on a token tagged 'neg', or 'naj' on one tagged 'sup',
    # is stripped before the token is matched against the lemma.
    prefix_cut = 0
    if (token.startswith('nie') and 'neg' in tag) or (token.startswith('naj') and 'sup' in tag):
        prefix_cut = 3
        token = token[prefix_cut:]

    # Length of the longest common prefix of the (stripped) token and the lemma.
    common = 0
    for token_char, lemma_char in zip(token, lemma):
        if token_char != lemma_char:
            break
        common += 1

    suffix = lemma[common:]
    cut = len(token) - common
    return case, f'{prefix_cut}_{cut}_{suffix}'

def _add_lemma_rules(instance, tag_dict):
    """Compute the lemma case and lemma rule of every token of ``instance``.

    Args:
        instance: mapping providing the ``TOKENS``, ``LEMMAS`` and ``TAGS``
            columns; the tag values are used as keys/indices into ``tag_dict``.
        tag_dict: lookup from a tag value to its tag string.

    Returns:
        A dict with the ``LEMMA_CASES`` and ``LEMMA_RULES`` columns, each a
        tuple with one entry per token (empty tuples for an empty instance).
    """
    tokens = instance[TOKENS]
    lemmas = instance[LEMMAS]
    tags = [tag_dict[tag_id] for tag_id in instance[TAGS]]
    pairs = [make_lemma_rule(*triple) for triple in zip(tokens, lemmas, tags)]
    # zip(*[]) produces nothing, so unpacking it into two names would raise
    # ValueError for an instance without tokens.
    cases, rules = zip(*pairs) if pairs else ((), ())
    return {
        LEMMA_CASES: cases,
        LEMMA_RULES: rules,
    }

def _none_to_mask(instance, columns):
    """Return the given columns of ``instance`` with ``None`` replaced by ``MASK_VALUE``.

    Only the columns listed in ``columns`` appear in the returned dict.
    """
    return {
        name: [MASK_VALUE if value is None else value for value in instance[name]]
        for name in columns
    }

def cast_labels(dataset, columns):
    """Cast ``columns`` to sequences of class labels and mask missing values.

    The label inventory of each column is gathered from every split in
    ``dataset`` and sorted; ``None`` is not treated as a label. After the
    cast, remaining ``None`` entries are replaced with ``MASK_VALUE``.

    Args:
        dataset: mapping of split name to split; must contain ``'train'``,
            whose features serve as the template for the new features.
        columns: names of the columns to cast.

    Returns:
        The cast and mapped dataset.
    """
    label_counts = defaultdict(Counter)
    for split in dataset.values():
        for column in columns:
            per_row_values = (row[column] for row in split)
            label_counts[column].update(chain.from_iterable(per_row_values))

    features = dataset['train'].features.copy()
    for column in columns:
        counts = label_counts[column]
        # None marks a missing value, not a label.
        counts.pop(None, None)
        features[column] = Sequence(ClassLabel(names=sorted(counts)))

    # labels to indices
    relabelled = dataset.cast(features)

    # replace None with MASK_VALUE
    def replace_none(instance):
        return _none_to_mask(instance, columns)

    return relabelled.map(replace_none)

def add_lemma_rules(dataset):
    """Add the ``LEMMA_CASES`` and ``LEMMA_RULES`` columns and cast them to class labels.

    Tag strings are resolved through the label names of the ``TAGS`` feature
    of the ``'train'`` split.
    """
    tag_names = dataset['train'].features[TAGS].feature.names

    def with_rules(instance):
        return _add_lemma_rules(instance, tag_names)

    return cast_labels(dataset.map(with_rules), [LEMMA_CASES, LEMMA_RULES])


def _do_collect_spines(tree):
    """Recursively split the subtree rooted at ``tree`` into head paths.

    Returns:
        A pair ``(spine, attachments)``:

        * ``spine`` is the list of nodes obtained by starting at ``tree`` and
          repeatedly descending into the head child until a childless node
          is reached.
        * ``attachments`` is a list of ``(category, h, path)`` tuples, one for
          every non-head child found in the subtree: ``path`` is that child's
          own spine, ``category`` is the category of the parent it hangs off,
          and ``h`` counts the nodes with that category on the parent's spine.

    Every node that has children must have exactly one child with ``is_head``
    set (enforced with an assertion).
    """
    children = tree.children
    if not children:
        return [tree], []

    head_candidates = [node for node in children if node.is_head]
    assert len(head_candidates) == 1
    head_child = head_candidates[0]

    spine = [tree]
    attachments = []
    dependent_spines = []
    for child in children:
        sub_spine, sub_attachments = _do_collect_spines(child)
        attachments.extend(sub_attachments)
        if child == head_child:
            spine.extend(sub_spine)
        else:
            dependent_spines.append(sub_spine)

    if dependent_spines:
        # h == which <tree.category> counting from the bottom is the anchor
        anchor_h = [node.category for node in spine].count(tree.category)
        attachments.extend(
            (tree.category, anchor_h, sub_spine) for sub_spine in dependent_spines
        )
    return spine, attachments

def _collect_spines(tree):
    """Map the last node of every spine in ``tree`` to ``(anchor, h, spine)``.

    ``spine`` is the node path without its last node. The spine that starts
    at ``tree`` itself gets the placeholder ``'<ROOT>'`` as both anchor and h.
    If collection fails, the tree is printed in bracket notation and the
    error is re-raised.
    """
    try:
        root_spine, attachments = _do_collect_spines(tree)
    except BaseException:
        print(tree.to_brackets())
        raise

    spines = {}
    for anchor, h, nodes in [('<ROOT>', '<ROOT>', root_spine)] + attachments:
        spines[nodes[-1]] = (anchor, h, nodes[:-1])
    return spines

def _compress_spine(spine):
    """Collapse runs of identical adjacent categories in ``spine`` to one entry.

    A category may only repeat directly after itself; a repeat anywhere else
    trips an assertion.
    """
    result = []
    for label in spine:
        if label not in result:
            result.append(label)
            continue
        # Already seen: it has to be a continuation of the current run.
        assert label == result[-1]
    return result

def _add_spines_and_attachments(instance, dataset_features, compress, NER=False):
    """Compute the spine, anchor and anchor-h label of every token of ``instance``.

    Args:
        instance: one dataset row.
        dataset_features: dataset features, passed on to
            ``tree_from_dataset_instance``.
        compress: when true, each spine goes through ``_compress_spine``.
        NER: selects the ``NE_*`` output columns instead of the plain ones and
            is forwarded to ``tree_from_dataset_instance``.

    Returns:
        A dict with three columns. When no tree could be built, every column
        holds ``None`` for each token. Otherwise each column holds one string
        per leaf, in ``from_index`` order: the spine categories joined with
        ``'_'`` (``EMPTY`` for an empty spine), the anchor, and ``str(h)``.
    """
    if NER:
        spines_col, anchors_col, anchor_hs_col = NE_SPINES, NE_ANCHORS, NE_ANCHOR_HS
    else:
        spines_col, anchors_col, anchor_hs_col = SPINES, ANCHORS, ANCHOR_HS

    tree = tree_from_dataset_instance(instance, dataset_features, NER=NER)
    if tree is None:
        n_tokens = len(instance[TOKENS])
        return {
            spines_col: [None] * n_tokens,
            anchors_col: [None] * n_tokens,
            anchor_hs_col: [None] * n_tokens,
        }

    spine_of = _collect_spines(tree)
    leaves = sorted(tree.get_yield(), key=lambda leaf: leaf.from_index)

    rows = []
    for leaf in leaves:
        anchor, anchor_h, nodes = spine_of[leaf]
        categories = [node.category for node in nodes]
        if compress:
            categories = _compress_spine(categories)
        label = '_'.join(categories) if categories else EMPTY
        rows.append((label, anchor, str(anchor_h)))

    spine_labels, anchors, anchor_hs = zip(*rows)
    return {
        spines_col: spine_labels,
        anchors_col: anchors,
        anchor_hs_col: anchor_hs,
    }

def add_spines_and_attachments(dataset, compress=False, NER=False):
    """Add spine / anchor / anchor-h columns to ``dataset`` and cast them to class labels.

    With ``NER`` set, the ``NE_*`` variants of the columns are produced.
    The features of the ``'train'`` split are used for every split.
    """
    features = dataset['train'].features

    def annotate(instance):
        return _add_spines_and_attachments(instance, features, compress=compress, NER=NER)

    annotated = dataset.map(annotate)
    if NER:
        label_columns = [NE_SPINES, NE_ANCHORS, NE_ANCHOR_HS]
    else:
        label_columns = [SPINES, ANCHORS, ANCHOR_HS]
    return cast_labels(annotated, label_columns)

def _preprocess_ne_heads(instance, root_value, empty_value):
    """Replace with ``None`` the NE head of tokens whose spine is empty and whose anchor is the root.

    ``empty_value`` and ``root_value`` are the encoded values compared against
    the ``NE_SPINES`` and ``NE_ANCHORS`` columns respectively. All other
    heads are kept unchanged.
    """
    masked_heads = []
    for head, spine, anchor in zip(instance[NE_HEADS], instance[NE_SPINES], instance[NE_ANCHORS]):
        unattached = spine == empty_value and anchor == root_value
        masked_heads.append(None if unattached else head)
    return {
        NE_HEADS: masked_heads,
    }

def preprocess_ne_heads(dataset):
    """Apply ``_preprocess_ne_heads`` to every instance of ``dataset``.

    The encoded values of the ``'ROOT'`` anchor and the ``EMPTY`` spine are
    looked up in the class labels of the ``'train'`` split.
    """
    train_features = dataset['train'].features
    # NOTE(review): this looks up 'ROOT', while _collect_spines uses the
    # placeholder '<ROOT>' for the top-level spine - confirm 'ROOT' is the
    # intended label here.
    root_value = train_features[NE_ANCHORS].feature.str2int('ROOT')
    empty_value = train_features[NE_SPINES].feature.str2int(EMPTY)

    def mask_heads(instance):
        return _preprocess_ne_heads(instance, root_value, empty_value)

    return dataset.map(mask_heads)

# Cell values of the adjacency matrices built by _add_adjacency_matrix:
# EDGE marks a token -> head link, NO_EDGE the absence of one.
EDGE, NO_EDGE = 1, 0

def _add_adjacency_matrix(instance, NER=False):
    """Build the square token-by-token head adjacency matrix of ``instance``.

    Row ``i`` has ``EDGE`` in the column of token ``i``'s head and ``NO_EDGE``
    elsewhere. If every head equals ``MASK_VALUE`` the whole matrix is filled
    with ``MASK_VALUE`` instead. With ``NER`` set, the ``NE_HEADS`` column is
    read and ``NE_ADJACENCY_MATRIX`` is written.

    Returns:
        A dict with the single matrix column.
    """
    if NER:
        matrix_col, heads_col = NE_ADJACENCY_MATRIX, NE_HEADS
    else:
        matrix_col, heads_col = ADJACENCY_MATRIX, HEADS

    heads = instance[heads_col]
    size = len(heads)

    if set(heads) == {MASK_VALUE}:
        return {matrix_col: [[MASK_VALUE] * size for _ in range(size)]}

    # ROOT (a token whose head is None) is its own head
    resolved = [position if head is None else head for position, head in enumerate(heads)]
    matrix = [[NO_EDGE] * size for _ in range(size)]
    # Zipping with the tokens limits the rows to the shorter of the two columns.
    for row, (_, head) in enumerate(zip(instance[TOKENS], resolved)):
        matrix[row][head] = EDGE
    return {matrix_col: matrix}

def add_adjacency_matrix(dataset, NER=False):
    """Add the head adjacency-matrix column (its ``NE_*`` variant when ``NER``) to ``dataset``."""

    def build(instance):
        return _add_adjacency_matrix(instance, NER=NER)

    return dataset.map(build)

# https://huggingface.co/docs/transformers/v4.23.1/en/tasks/token_classification

def masked_word_ids(word_ids, masking_strategy=FIRST):
    """Keep one word id per word and mask every other position with ``None``.

    Args:
        word_ids: for each sub-token, the index of the word it belongs to, or
            ``None`` for special tokens.
        masking_strategy: ``FIRST`` keeps the id on the first sub-token of
            each word, ``LAST`` keeps it on the last one.

    Returns:
        A list as long as ``word_ids`` holding the word id at the kept
        positions and ``None`` everywhere else.

    Raises:
        RuntimeError: if ``masking_strategy`` is neither ``FIRST`` nor
            ``LAST`` (otherwise the result would silently come out shorter
            than the input).
    """
    if masking_strategy not in (FIRST, LAST):
        raise RuntimeError(f'Unknown masking strategy: {masking_strategy}')

    last_position = len(word_ids) - 1
    masked = []
    for i, word_idx in enumerate(word_ids):
        # Set the label for the first/last token of each word.
        # Mask the label for:
        #   * special tokens (word id = None)
        #   * other tokens in a word
        if word_idx is None:
            masked.append(None)
            continue
        # A position without a neighbour on the relevant side is a word
        # boundary. Without these bounds checks, FIRST would wrap around to
        # word_ids[-1] at i == 0 and LAST would index past the end of the list.
        if masking_strategy == FIRST:
            neighbour = word_ids[i - 1] if i > 0 else None
        else:
            neighbour = word_ids[i + 1] if i < last_position else None
        masked.append(word_idx if word_idx != neighbour else None)
    return masked

def _align_row(values, masked_word_ids):
    """Pick ``values[idx]`` for each index in ``masked_word_ids``; ``None`` indices give ``MASK_VALUE``.

    On any failure both arguments are printed and the error is re-raised.
    """
    try:
        aligned = []
        for idx in masked_word_ids:
            aligned.append(MASK_VALUE if idx is None else values[idx])
        return aligned
    except BaseException:
        print(values)
        print(masked_word_ids)
        raise

def _align_example(example, masked_ids):
    """Re-index every label column of ``example`` according to ``masked_ids``.

    Args:
        example: mapping of column name to value. The ``TOKENS`` and
            ``LEMMAS`` columns and any column whose value is a string are
            skipped.
        masked_ids: for each output position, the index of the source item to
            copy, or ``None`` for a position to be filled with ``MASK_VALUE``.

    Returns:
        A ``defaultdict(list)`` mapping each processed column to its aligned
        values. A column whose first item is iterable is treated as a matrix
        and aligned along both axes; a masked row is filled with
        ``MASK_VALUE`` entirely.

    On failure the example and ``masked_ids`` are printed and the error is
    re-raised.
    """
    column_names = list(example.keys())
    labels = defaultdict(list)
    # One shared all-masked row, reused for every masked matrix row.
    masked_row = [MASK_VALUE] * len(masked_ids)

    try:
        for column_name in column_names:
            if column_name in (TOKENS, LEMMAS):
                continue
            values = example[column_name]
            if isinstance(values, str):
                continue
            # An empty column has no first item to inspect; treat it as a
            # flat row instead of failing on values[0].
            is_matrix = len(values) > 0 and hasattr(values[0], '__iter__')
            if is_matrix:
                aligned_labels = [
                    _align_row(values[idx], masked_ids) if idx is not None else masked_row
                    for idx in masked_ids
                ]
            else:
                aligned_labels = _align_row(values, masked_ids)
            labels[column_name] = aligned_labels
    except Exception:
        print(example, masked_ids)
        raise

    return labels

def morf_tokenize(text, m):
    """Split ``text`` into segments using the analyses returned by ``m.analyse``.

    Each analysis is a ``(start, end, interpretation)`` triple whose
    interpretation begins with the segment's surface form. All analyses of
    the same ``(start, end)`` span must agree on that form (asserted).

    Returns:
        The forms of the spans ``(0, 1), (1, 2), ...`` up to the largest end
        index seen; an empty list when there are no analyses.
    """
    forms = {}
    last_end = 0
    for start, end, interp in m.analyse(text):
        form = interp[0]
        # The first analysis of a span registers its form; later ones must match it.
        registered = forms.setdefault((start, end), form)
        assert (form == registered)
        last_end = max(last_end, end)
    return [forms[(position, position + 1)] for position in range(last_end)]

def _morf_tokenize_and_align(example, morfeusz, masking_strategy=FIRST):
    """Re-segment the tokens of ``example`` with ``morf_tokenize`` and align its labels.

    Every original token is replaced by its segments. The first segment of a
    token is marked ``SEG_BEGIN`` and inherits the token's labels; the
    remaining ones are marked ``SEG_INSIDE`` and are masked during alignment.

    Raises:
        RuntimeError: if ``masking_strategy`` is unknown, or is ``LAST``
            (not supported together with retokenization).
    """
    if masking_strategy not in (FIRST, LAST):
        raise RuntimeError(f'Uknown masking strategy: {masking_strategy}')
    if masking_strategy == LAST:
        raise RuntimeError(f'Can’t use {masking_strategy} masking strategy with retokenize')

    labels = defaultdict(list)
    mask = []
    for word_idx, token in enumerate(example[TOKENS]):
        segments = morf_tokenize(token, morfeusz)
        for segment_idx, segment in enumerate(segments):
            starts_word = segment_idx == 0
            labels[TOKENS].append(segment)
            labels[SEGS].append(SEG_BEGIN if starts_word else SEG_INSIDE)
            mask.append(word_idx if starts_word else None)

    labels.update(_align_example(example, mask))
    return labels

def morfeusz_retokenize(dataset, masking_strategy=FIRST):
    """Re-segment every example of ``dataset`` with Morfeusz and cast ``SEGS`` to class labels.

    A single ``Morfeusz(generate=False)`` instance is shared by all examples;
    its dictionary id is printed before processing starts.
    """
    analyser = Morfeusz(generate=False)
    print(f'retokenizing using {analyser.dict_id()}')

    def retokenize(example):
        return _morf_tokenize_and_align(example, analyser, masking_strategy=masking_strategy)

    return cast_labels(dataset.map(retokenize), [SEGS])

def bert_tokenize_and_align(example, tokenizer, masking_strategy=FIRST):
    """Tokenize the words of ``example`` with ``tokenizer`` and align its labels to the sub-tokens.

    The tokenizer is called on the ``TOKENS`` column with truncation enabled
    and ``is_split_into_words=True``. Labels are kept on the first or last
    sub-token of each word (per ``masking_strategy``) and masked elsewhere.

    Returns:
        The tokenizer output, updated with the aligned label columns.

    Raises:
        RuntimeError: if ``masking_strategy`` is neither ``FIRST`` nor ``LAST``.
    """
    if masking_strategy not in (FIRST, LAST):
        raise RuntimeError(f'Uknown masking strategy: {masking_strategy}')

    encoding = tokenizer(example[TOKENS], truncation=True, is_split_into_words=True)
    mask = masked_word_ids(encoding.word_ids(), masking_strategy)
    encoding.update(_align_example(example, mask))
    return encoding

# NOTE: the string literal below is never executed; it only preserves disabled
# code. The definitions inside take `self`, so they appear to be methods lifted
# from a class, and they use `concatenate_datasets`, which is not among the
# imports visible at the top of this file.
'''
    def _remove_columns(self, dataset):
        to_keep = ['id', TOKENS] + self.categories + self.categories2d
        columns_to_remove = [col for col in dataset.column_names if col not in to_keep]
        return dataset.remove_columns(columns_to_remove)

# TODO for 2d categories!!!
    def _unify_signatures(self, datasets):
        if not datasets:
            return None
        datasets = [self._remove_columns(dataset) for dataset in datasets]
        if len(datasets) == 1:
            return datasets
        print('unifying datasets:')
        for dataset in datasets:
            print(len(dataset), 'examples')
        for category in self.categories: #TODO!!! + self.categories2d:
            values = set()
            for dataset in datasets:
                if category in dataset.features:
                    feature = dataset.features[category].feature
                    if type(feature) == ClassLabel:
                        values.add(tuple(dataset.features[category].feature.names))
                    else:
                        print(type(feature))
                        1/0
                        values.add('VALUE')
            if len(values) > 1:
                print(f'{category}: aligning labels')
                mapping = {value : i for i, value in enumerate(sorted(set(chain.from_iterable(values))))}
                datasets = [dataset.align_labels_with_mapping(mapping, category) for dataset in datasets]
        return datasets

    def _join_datasets(self, datasets):
        if not datasets:
            return None
        if self.segmentation:
            datasets = [self._retokenize_dataset(d) for d in datasets]
        datasets = self._unify_signatures(datasets)
        if len(datasets) == 1:
            return datasets[0]
        print('joining datasets:')
        for dataset in datasets:
            print(len(dataset), 'examples')
        joined = concatenate_datasets(datasets)
        print('result:', len(joined), 'examples')
        if self.segmentation:
            joined = self._retokenize_dataset(joined)
        return joined
''';