# Source code for sotastream.filters.filters

import sys
import re
import logging

# Module-level logger shared by the filters in this file.
logger = logging.getLogger("sotastream")



[docs]
def SkipBlanks(lines, fields=[0, 1]):
    """
    Skips lines that are blank in any of the requested fields.
    Also zeroes out the third field, if present, of the first line yielded after
    one or more skipped lines (to reset docid).
    This is important for training document models, where a blank field can teach the model
    to drop / add sentences.

    A field counts as blank when its index is past the end of the line, or when its
    value is ``None`` or the empty string.

    :param lines: The data stream. Lines must support ``len()`` and indexing; item
        assignment is needed on lines with three or more fields (docid reset).
    :param fields: fields to check for blankness. This list is only read, never modified.
    :return: generator over the lines that are non-blank in all requested fields
    """
    skipped_prev = False
    for line in lines:
        num_fields = len(line)
        is_blank = any(
            fieldno >= num_fields or line[fieldno] is None or line[fieldno] == ""
            for fieldno in fields
        )
        if is_blank:
            skipped_prev = True
            continue

        # If we skipped the previous line, we invalidate the current document ID.
        # The reset is applied to the line itself, not to the `fields` argument,
        # which would corrupt the caller's (or the shared default) list.
        if skipped_prev and num_fields >= 3:
            line[2] = 0
        skipped_prev = False

        yield line




[docs]
def BitextFilter(lines, end_range=2):
    """
    Truncates every line to its leading fields: fields with index below
    ``end_range`` are kept, all later fields are dropped.

    Each line's ``fields`` attribute is replaced in place and the same line
    object is yielded.

    :param lines: the stream of input lines (objects with a ``fields`` attribute)
    :param end_range: One higher than the last 0-index field number that should be included.
    """
    kept = slice(0, end_range)
    for record in lines:
        record.fields = record.fields[kept]
        yield record




[docs]
def MatchFilter(lines, pattern=r'[\=\+\#\@\^\~\<\>]', fields=[0, 1], invert=False):
    """
    Keeps lines whose two selected fields contain the same multiset of pattern
    matches (by default the symbols ``= + # @ ^ ~ < >``), regardless of order.

    Lines with fewer than two fields are logged at debug level and dropped.

    :param lines: the stream of input lines
    :param pattern: regular expression whose matches are collected from both fields
    :param fields: exactly two field indices to compare
    :param invert: if true, keep the lines whose matches differ instead
    :raises IndexError: if ``fields`` does not hold exactly two indices (raised
        when the first line with at least two fields is processed)
    """
    keep_when_equal = not invert
    for line in lines:
        if len(line) < 2:
            logger.debug(f"MatchFilter: bad line: {line}")
            continue

        if len(fields) != 2:
            raise IndexError("need to specify two field indices for matching")

        left_text, right_text = line[fields[0]], line[fields[1]]

        # Sorting makes the comparison independent of where the matches occur.
        left_hits = sorted(re.findall(pattern, left_text))
        right_hits = sorted(re.findall(pattern, right_text))
        if (left_hits == right_hits) == keep_when_equal:
            yield line




[docs]
def RegexFilter(lines, pattern, fields=[0, 1], invert=False):
    """
    Removes a line if the pattern is found in one or more of the given fields.

    With ``invert`` set, the rule becomes: keep a line only if the pattern is
    found in every given field.

    Lines with fewer fields than ``len(fields)`` are logged at debug level and dropped.

    :param lines: the stream of input lines
    :param pattern: regular expression searched for in each selected field
    :param fields: indices of the fields to search
    :param invert: if true, keep only lines where all selected fields match
    """
    matcher = re.compile(pattern)
    for line in lines:
        if len(line) < len(fields):
            logger.debug(f"RegexFilter: bad line: {line}")
            continue

        # Search every selected field up front (no short-circuiting).
        hits = [matcher.search(line[idx]) for idx in fields]
        if invert:
            keep = all(hits)
        else:
            keep = not any(hits)
        if keep:
            yield line