# Source code for confusable_homoglyphs.confusables

# -*- coding: utf-8 -*-
from .utils import load
from .categories import unique_aliases, alias


# Lookup table of confusable characters, built once at import time by the
# package's ``load`` helper from 'confusables.json' (presumably the parsed
# JSON content -- confirm against ``utils.load``).
# ``is_confusable`` queries it with ``.get(char)`` and reads each entry as a
# sequence of dicts whose 'c' key holds the look-alike glyph(s).
confusables_data = load('confusables.json')


class Found(Exception):
    """Marker exception signalling that a search has found a match.

    It carries no extra state; it exists so that search code can raise it to
    leave nested loops as soon as the first match is seen.
    """


def is_mixed_script(string, allowed_aliases=['COMMON']):
    """Tell whether ``string`` mixes characters from several script blocks.

    Script block aliases listed in ``allowed_aliases`` are left out of the
    count, so they never make a string "mixed" on their own.

    E.g. ``B. C`` is not considered mixed-scripts by default: it contains
    characters from **Latin** and **Common**, but **Common** is excluded by
    default.

    >>> confusables.is_mixed_script('Abç')
    False
    >>> confusables.is_mixed_script('ρτ.τ')
    False
    >>> confusables.is_mixed_script('ρτ.τ', allowed_aliases=[])
    True
    >>> confusables.is_mixed_script('Alloτ')
    True

    :param string: A unicode string
    :type string: str
    :param allowed_aliases: Script blocks aliases not to consider. Matching is
        case-insensitive: every entry is upper-cased before comparison.
    :type allowed_aliases: list(str)
    :return: Whether ``string`` is considered mixed-scripts or not, i.e.
        whether more than one alias remains once the allowed ones are removed.
    :rtype: bool
    """
    # Normalise case once so callers may pass 'common' as well as 'COMMON'.
    ignored = {name.upper() for name in allowed_aliases}
    remaining = unique_aliases(string) - ignored
    return len(remaining) > 1


def is_confusable(string, greedy=False, preferred_aliases=[]):
    """Checks if ``string`` contains characters which might be confusable with
    characters from ``preferred_aliases``.

    If ``greedy=False``, it will only return the first confusable character
    found without looking at the rest of the string, ``greedy=True`` returns
    all of them.

    ``preferred_aliases=[]`` can take an array of unicode block aliases to
    be considered as your 'base' unicode blocks:

    -  considering ``paρa``,

       -  with ``preferred_aliases=['latin']``, the 3rd character ``ρ``
          would be returned because this greek letter can be confused with
          latin ``p``.
       -  with ``preferred_aliases=['greek']``, the 1st character ``p``
          would be returned because this latin letter can be confused with
          greek ``ρ``.
       -  with ``preferred_aliases=[]`` and ``greedy=True``, you'll discover
          the 29 characters that can be confused with ``p``, the 23
          characters that look like ``a``, and the one that looks like ``ρ``
          (which is, of course, *p* aka *LATIN SMALL LETTER P*).

    Characters that have no entry in the confusables table are never
    reported, whatever ``preferred_aliases`` is. Each distinct character of
    ``string`` is examined only once.

    >>> confusables.is_confusable('paρa', preferred_aliases=['latin'])[0]['character']
    'ρ'
    >>> confusables.is_confusable('paρa', preferred_aliases=['greek'])[0]['character']
    'p'
    >>> confusables.is_confusable('Abç', preferred_aliases=['latin'])
    False
    >>> confusables.is_confusable('AlloΓ', preferred_aliases=['latin'])
    False
    >>> confusables.is_confusable('ρττ', preferred_aliases=['greek'])
    False
    >>> confusables.is_confusable('ρτ.τ', preferred_aliases=['greek', 'common'])
    False
    >>> confusables.is_confusable('ρττp')
    [{'homoglyphs': [{'c': 'p', 'n': 'LATIN SMALL LETTER P'}], 'alias': 'GREEK', 'character': 'ρ'}]

    :param string: A unicode string
    :type string: str
    :param greedy: Don't stop on finding one confusable character - find all of them.
    :type greedy: bool
    :param preferred_aliases: Script blocks aliases which we don't want ``string``'s characters
        to be confused with. Matching is case-insensitive.
    :type preferred_aliases: list(str)
    :return: False if not confusable, all confusable characters and with what they are confusable
        otherwise. Each item is a dict with the keys ``'character'``, ``'alias'`` and
        ``'homoglyphs'``.
    :rtype: bool or list
    """
    # Upper-case once and keep as a set: membership is tested for every glyph.
    preferred = {name.upper() for name in preferred_aliases}
    outputs = []
    checked = set()
    for char in string:
        if char in checked:
            continue
        checked.add(char)
        char_alias = alias(char)
        if char_alias in preferred:
            # it's safe if the character might be confusable with homoglyphs from other
            # categories than our preferred categories (=aliases)
            continue
        found = confusables_data.get(char)
        if not found:
            # No known homoglyph for this character. Without this guard a
            # missing entry (None) would be iterated below and raise TypeError
            # whenever preferred aliases are given.
            continue
        # character λ is considered confusable if λ can be confused with a character from
        # preferred_aliases, e.g. if 'LATIN', 'ρ' is confusable with 'p' from LATIN.
        # if 'LATIN', 'Γ' is not confusable because in all the characters confusable with Γ,
        # none of them is LATIN.
        if preferred and not any(
                alias(glyph) in preferred
                for entry in found
                for glyph in entry['c']):
            continue
        output = {
            'character': char,
            'alias': char_alias,
            'homoglyphs': found,
        }
        if not greedy:
            return [output]
        outputs.append(output)

    return outputs or False


def is_dangerous(string, preferred_aliases=[]):
    """Checks if ``string`` can be dangerous, i.e. is it not only mixed-scripts
    but also contains characters from other scripts than the ones in ``preferred_aliases``
    that might be confusable with characters from scripts in ``preferred_aliases``

    For ``preferred_aliases`` examples, see ``is_confusable`` docstring.

    >>> bool(confusables.is_dangerous('Allo'))
    False
    >>> bool(confusables.is_dangerous('AlloΓ', preferred_aliases=['latin']))
    False
    >>> bool(confusables.is_dangerous('Alloρ'))
    True
    >>> bool(confusables.is_dangerous('AlaskaJazz'))
    False
    >>> bool(confusables.is_dangerous('ΑlaskaJazz'))
    True

    :param string: A unicode string
    :type string: str
    :param preferred_aliases: Script blocks aliases which we don't want ``string``'s characters
        to be confused with.
    :type preferred_aliases: list(str)
    :return: ``False`` when ``string`` is not dangerous; otherwise the (truthy) result of
        ``is_confusable`` for ``string``. Wrap the call in ``bool()`` if a strict boolean
        is needed.
    :rtype: bool or list
    """
    # A string that stays within a single script cannot be dangerous, so the
    # confusable lookup is skipped entirely in that case.
    if not is_mixed_script(string):
        return False
    return is_confusable(string, preferred_aliases=preferred_aliases)