cleaner.py 15.1 KB

Edit Raw Blame History

"""Read Markdown file converted from ODT and fix as many errors as possible.
This uses heuristics to fix markdown, punctuation and even common misspellings.
"""
import re


class Cleaner(object):
    'The class to clean Markdown'

    def __init__(self):
        """Create a cleaner with diagnostic output switched off."""
        # Selects which step prints its diagnostics. The methods of this class
        # compare it against 'line', 'markdown', 'merge', 'spaceout',
        # 'interpellations', 'speakers' and 'comments'; the empty default
        # matches none of them, so nothing is printed.
        self.verbose = ''

    def clean(self, filename, output):
        ' Do all the work '
        pars = self._read(filename)
        pars = self._merge_titles(pars)
        pars = self._clear_odt(pars)
        pars = self._merge_pars(pars)
        pars = self._trim_interpelations(pars)
        pars = self._fix_speakers(pars)
        pars = self._fix_comments(pars)
        # if self.args.mode != 'senate':
        # pars = self._fix_senate(pars)
        # if self.args.text:
        pars = self._simplify_formatting(pars)
        self.save(pars, output)
        # if self.args.speakers:
        # self.extractSpeakers(filename, pars)

    def _read(self, filename):
        'Read data from file. Merge lines into paragraphs using empty line as separator'
        pars = []
        current = ''
        for line in open(filename):
            line = self._fix_punctuation(line).strip()
            if line:
                if current.endswith('- ') and not current.endswith('o- '):
                    current = current[:-2]
                current += line + ' '
            else:
                pars.append(current.strip())
                current = ''
        if current.strip():
            pars.append(current.strip())
        return pars

    def _merge_titles(self, pars):
        'Merge split speakers with title in the first line and name in the second'
        fixed = []
        for par in pars:
            if par.startswith('**') and fixed and ':' not in fixed[-1] and fixed[-1].endswith('**'):
                fixed[-1] = (fixed[-1][:-2] + ' ' + par[2:]).replace('  ', ' ')
            else:
                fixed.append(par)
        return fixed

    def _fix_punctuation(self, line):
        'Fix space punctuation and mdash'
        line = re.sub(r'\s\s', ' ', line)
        line2 = line
        replacements = {
            ' ': ' ',
            ' ,,': ' „',
            ',, ': ', ',
            ' .': ' ',
            '"': '”',
            '---': '—',
            '( ': '(',
            ' )': ')',
        }
        for pattern, replacement in replacements.items():
            line = line.replace(pattern, replacement)
        # Fix incorrect spacing around hyphens: biało- czerwony -> biało-czerwony
        line = re.sub(
            r'([a-ząćęłńóśżź]{3}[wknłcz]o)-\s+([a-zćłóśżź]{4})', r'\\1-\\2', line)
        # Fix minus used as hyphen: poru- cznik -> porucznik
        line = re.sub(
            r'([a-ząćęłńośżź])-\s+([a-ząćęłńóśżź]{2})', u'\\1\\2', line)
        # Fix hyphens
        line = re.sub(u' ([-–—]*)([^ -–—])', u' — \\2', line)
        if self.verbose == 'line' and line != line2:
            print(r'DIFFERENT\n{}\n{}\n'.format(line2, line))
        return line

    def _clear_odt(self, pars):
        'Remove leftovers from OpenOffice: bookmarks, footnotes, hyphenation, backslashes'
        removed = [r'[]', '\\', '{.Apple-converted-space}', r'(—)']
        reremoved = [r'\s*\u00ad\s*', '{#anchor-?[0-9]*}', r'\{[0-9s.]*\}',
                     r'\[\W]*\]', r'\[\s*\]']
        cleared = []
        for par in pars:
            for remove in removed:
                par = par.replace(remove, '')
            for remove in reremoved:
                par = re.sub(remove, '', par)
            if '*' in par:
                par = self._fix_markdown(par)
            if re.search(r'\S\s\S\s\S\s\S\s\S\s\S\s\S', par):
                par = self._fix_spaceout(par)
            cleared.append(par.strip())
        return cleared

    def _fix_spaceout(self, par):
        'Try to merge spaced out words'
        oldpar = par
        match = re.search(r'\s\S\s\S\s\S\s\S\s\S\s\S\s', par)
        if match:
            start = match.start()
            end = start
            while end < len(par) - 1 and par[end] == ' ':
                end += 2
            spaceout = re.sub('(.)([A-ZĆŁÓŚŻŹ])', '\\1 \\2',
                              par[start:end].replace(' ', ''))
            par = par[:start] + spaceout + par[end:]
            if self.verbose == 'spaceout':
                print(f'{oldpar}\n{par}\n')
        return par

    def _fix_markdown(self, par):
        'Simplify markdown asterisk. Fix spaces adjacent to asterisk'
        oldpar = par
        # Replace redundant asterisks
        par = re.sub(r'\*\*([-=.]?)\*\*', '\\1', par)
        for mark in ['**', '*']:
            start = par.find(mark)
            if start == -1 or start >= len(par) - len(mark) or par[start+len(mark)] == '*':
                continue
            while start > 0 and not par[start-1].isspace():
                start -= 1
            last = par.rfind(mark)
            if last <= start:
                continue
            if par[last-1] == ' ':
                par = par[:start] + mark + par[start:last -
                                               1].replace('*', '') + mark + ' ' + par[last+len(mark):]
            else:
                while (last < len(par) and par[last] != ' '):
                    last += 1
                par = par[:start] + mark + \
                    par[start:last].replace('*', '') + mark + par[last:]
        par = re.sub(r'\*\*(\s*)\*\*', '\\1', par, 0, re.MULTILINE)
        # Add trailing * for starting * + bracket
        par = re.sub(r'\s\*\(([^)]*)\)\.?\s+\*?', lambda m: ' *(' +
                     m.group(1).replace('*', '') + ')* ', par)
        # Remove comment with no colon/bracket - some texts contain spurious italic
        par = re.sub(
            r'\*+([A-Za-ząćęłńóśźż ][A-Za-ząćęłńóśźżŚĆŁŻŹ !?-]*)\*+', '\\1', par)
        if self.verbose == 'markdown' and par != oldpar:
            print('OLD: {oldpar}\nNEW: {par}\n')
        return par

    def _fix_speakers(self, pars):
        'Heuristics to remove spurious speaker formatting and to format some unformatted speakers'
        fixed = []
        for par in pars:
            oldpar = par
            if '**' in par:
                # Remove ** around single words
                par = re.sub(
                    r'\*\*([a-z0-9ąćęłńóśżś%,.:/\s-]*)\*\*', '\\1', par)
                # Remove ** not at the start of the line
                if '**' in par and not par.startswith('**'):
                    par = par.replace('**', '')
                # Move ** after the colon
                par = re.sub(
                    r'^\*\*([^*:]*):\s+([^*]*)\s*\*\*', '**\\1:** \\2', par)
                # Remove ** if the content does not look like person name
                match = re.match(r'^\*\*([^\*]*):?\*\*', par)
                if not match or not self._can_be_person(match.group(1)):
                    par = self._find_speakers(par.replace('**', ''))
                    if self.verbose == 'speakers':
                        print('NOT SPEAKER ', par)
            else:
                par = self._find_speakers(par)
            fixed.append(par)
            if self.verbose == 'speakers' and oldpar != par:
                if par.startswith('**'):
                    print(f'SPEAKER: {par}')
                else:
                    print(f'CLEANED: {par}')
        return fixed

    def _find_speakers(self, par):
        'Add ** around unformatted speakers if applicable'
        match = re.match(
            r'([A-Z][a-ząćęłńóśżś.]+\s+[A-Z][a-ząćęłńóśżź]+\s+[A-Z][A-Za-ząćęłńóśźżŚĆŁŻŹ-]+)[.:\s]*$', par)
        if match:
            if self._can_be_person(match.group(1)):
                return f'**{match.group(1)}:**'
            else:
                return par
        match = re.match(
            r'([A-Z][a-ząćęłńóśźż.]+\s+[A-Z][A-Za-ząćęłńóśźżŚĆŁŻŹ-]+):\s+(.*)$', par)
        if match:
            return f'**{match.group(1)}:**\n\n{match.group(2)}'
        match = re.match(
            r'([A-Z][a-ząćęłńóśźż.]+\s+[A-Z][A-Za-ząćęłńóśźżŚĆŁŻŹ-]+\s+[A-Z][A-Za-ząćęłńóśźżŚĆŁŻŹ-]+):\s+(.*)$', par)
        if match:
            return f'**{match.group(1)}:**\n\n{match.group(2)}'
        if par == u'Marszałek':
            return u'**Marszałek:**'
        return par

    def _fix_comments(self, pars):
        """ Add italic to some comments not formatted in the original file
        They are assumed to start after a dot, be in brackets and start with uppercase.
        Fragments with numbers are excluded to avoid formatting legal references.
        Also fix some comment formatting.
        """
        fixed = []
        for par in pars:
            oldpar = par
            # Fix comments like '*(Part of* comment)'
            par = par.replace('(*', '*(')
            par = re.sub(r'\*\(([^\)]*)\*([^\)]*)\)', '*(\\1\\2)*', par)
            # Add missing comments
            par = re.sub(r'\. \(([A-ZĆŁŚŻŹ][^0-9)]*)\)\.?(\s|$|\*)',
                         lambda m: '. *(' + m.group(1).replace('*', '') + ')* ', par)
            # Fix comments marked as new speaker
            not_speaker = re.match(r'\*\*\((.*)\)\.?\s*\*\*$', par)
            if not_speaker:
                par = '*(' + not_speaker.group(1) + ')*'
            if self.verbose == 'comments' and oldpar != par:
                print(f'COMMENT: {format(par)}')
            fixed.append(par)
        return fixed

    def _fix_senate(self, pars):
        fixed = []
        header = False
        for par in pars:
            if not header and not par.startswith('**'):
                continue
            header = True
            if par.startswith('[]'):
                continue
            elif re.match(r'\[\d+\]?$', par):
                continue
            elif re.match(r'\*\(Początek posiedzenia o godzinie \d+ minut \d+\)\*', par):
                continue
            elif re.match(r'[0-9]+\. posiedzenie [\wąćęłńóśżźŹŻŚŁ\s,]+$', par):
                continue
            elif re.match(r'[0-9]+$', par):
                continue
            elif re.match(r'w dniu [0-9]+ \S+ [0-9]{4} r\.$', par):
                continue
            if re.match(r'[0-9]+\. posiedzenie .*\*', par):
                fixed.append(re.sub(r'^[^*]*(\*.*)$', '\\1', par))
            else:
                fixed.append(par)
                if '*(Koniec posiedzenia' in par:
                    break
        return fixed

    def _can_be_person(self, person):
        'Check if given fragment can be a person. Used to remove spurious bold around some titles'
        if not 3 < len(person) < 151:
            return False
        for prefix in ['(', 'Obywatel', 'Wysoka', 'Proszę', 'Polski', 'Panie', 'Dziękuję',
                       'Sprawozdanie', 'Pan ', 'Pani ', 'Przystępuj']:
            if person.startswith(prefix):
                return False
        for suffix in ['ego']:
            if person.endswith(suffix):
                return False
        for infix in ['II', 'Sejm', 'rzystępujemy', 'nterpelacj', 'Warszawa']:
            if infix in person:
                return False
        if re.search('[0-9]', person):
            return False
        return True

    def _simplify_formatting(self, pars):
        'Remove repeated or nested formatting'
        text = []
        for par in pars:
            par = re.sub(r'\*\*[^\*]*\*\*', '', par)
            par = re.sub(r'\*[^\*]*\*', '', par)
            par = par.strip()
            if par:
                text.append(par)
        return text

    def _merge_pars(self, pars):
        """Heuristic merging of paragraphs. Adds weights suggesting it IS a new paragraph.
        If the total weight is below the threshold, the paragraph is merged.
        """
        merged = pars[:1]
        for par in pars[1:]:
            if not par:
                continue
            previous = merged[-1]
            eol = 0
            if previous.endswith(('!', '?', '.', ':', '"', ';')):
                eol += 14
            if '*' in previous:
                eol += 5
            if len(previous) < 60:
                eol += 5
            if '**' in par and len(par) < 80:
                eol += 16
            if previous.endswith('**'):
                eol += 17
            if par.startswith('--'):
                eol += 12
            if par[0].isupper():
                eol += 5
            if par.startswith('**') and len(par) > 2 and par[2].isupper():
                eol += 17
            if re.search(r' [a-z] [a-z] [a-z] [a-z] ', previous):
                eol += 11
            if eol >= 10:
                if self.verbose == 'merge' and par[0].islower():
                    print(f'[{eol}]: {merged[-1]}\nDO NOT MERGE WITH: {par}\n')
                merged.append(par)
            else:
                if self.verbose == 'merge':
                    print(f'{merged[-1]}\nMERGE WITH: {par}\n')
                if merged[-1].endswith('-'):  # Remove trailing hyphen when merging
                    merged[-1] = merged[-1][:-1] + par
                else:
                    merged[-1] = merged[-1] + ' ' + par
        return merged

    def _trim_interpelations(self, pars):
        'Remove interpelations'
        trimmed = []
        ignored = False
        for par in pars:
            if par.replace('*', '').startswith(u'Odpowiedź'):  # and not self.interpellations
                ignored = True
                if self.verbose == 'interpellations':
                    print('\n**Interpellations**')
            if not ignored:
                trimmed.append(par)
            elif self.verbose == 'interpellations':
                print(par)
        return trimmed

    @staticmethod
    def save(pars, path):
        'Write the content to given file'
        with open(path, mode='w') as out:
            for par in pars:
                out.write(par)
                out.write(u'\n\n')

    # def extractSpeakers(self, filename, pars):
        # 'Prepare a list of unique speaker names for checking'
        # self.speakers.write(r'[{}]\n'.format(filename))
        # speakers = set()
        # for par in pars:
        #     match = re.match(r'^\*\*([^\*:]*)', par)
        #     if match:
        #         speakers.add(match.group(1))
        # speakers = list(speakers)
        # speakers.sort()
        # for speaker in speakers:
        #     self.speakers.write(speaker + u'\n')
        # self.speakers.write(u'\n')


# Read options
# parser = argparse.ArgumentParser()
# parser.add_argument('-o', '--output', help='output directory', default='out')
# parser.add_argument('-i', '--interpellations',
#                     help='include interpellations', action='store_true')
# parser.add_argument('-x', '--speakers',
#                     help='extract speakers', action='store_true')
# parser.add_argument('-t', '--text', help='remove metatext',
#                     action='store_true')
# parser.add_argument('-v', '--verbose', help='show messages', default='none',
#                     choices=['none', 'line', 'markdown', 'merge', 'spaceout',
#                              'interpellations', 'speakers', 'comments'])
# parser.add_argument('-m', '--mode', help='special mode', default='default',
#                     choices=['default', 'senate'])
# parser.add_argument('filename', help='file to process', nargs='+')
# args = parser.parse_args()