Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5dde0878 authored by Roozbeh Pournader
Browse files

Add many more emoji tests

This creates an exact map of the emoji font and adds the following
tests:
* Test that all 'emoji style' variation sequences exist in the color
  emoji font.
* Test that the emoji font contains the exact same character set that
  we expect.
* Test that all unsupported flags map to the same flag tofu glyph.
* Test that the reverses of ZWJ sequences exist in the font and map to
  the same glyphs as the original sequences.
* Test that legacy PUA Android emoji are supported.
* Test that all emoji sequences that should map to the same glyph do
  so, but no other sequences map to the same glyph.

Bug: 26187231
Change-Id: I64ef17862df2d91879bf7904055c77c24abb7d88
parent 409635b2
Loading
Loading
Loading
Loading
+291 −33
Original line number Diff line number Diff line
#!/usr/bin/env python

import collections
import copy
import glob
import itertools
from os import path
import sys
from xml.etree import ElementTree

from fontTools import ttLib

EMOJI_VS = 0xFE0F

LANG_TO_SCRIPT = {
    'as': 'Beng',
    'bn': 'Beng',
@@ -57,13 +61,26 @@ def lang_to_script(lang_code):
    return LANG_TO_SCRIPT[lang]


def get_best_cmap(font):
def printable(inp):
    """Return a human-readable rendering of a character, sequence, or set.

    Args:
        inp: One of
            * an integer code point, rendered as ``U+XXXX``;
            * a tuple of code points (a character sequence), rendered as
              ``<U+XXXX, U+YYYY>``;
            * a set or frozenset of either of the above, rendered as
              ``{..., ...}``.

    Returns:
        The formatted string. Members of a set are emitted sorted by their
        rendered text, so the result is the same on every run.
    """
    if isinstance(inp, (set, frozenset)):  # set of character sequences
        # Sort the rendered strings rather than the members themselves: a set
        # may mix integers and tuples, which do not compare on every Python.
        return '{' + ', '.join(sorted(printable(seq) for seq in inp)) + '}'
    if isinstance(inp, tuple):  # character sequence
        return '<' + ', '.join(printable(ch) for ch in inp) + '>'
    # single character
    return 'U+%04X' % inp


def open_font(font):
    """Open a font file from the fonts directory with fontTools.

    Args:
        font: A ``(font_file, index)`` pair. ``font_file`` is resolved
            relative to the module-level ``_fonts_dir``. ``index`` is passed
            to ``TTFont`` as ``fontNumber`` unless it is ``None``.

    Returns:
        A newly opened ``ttLib.TTFont`` for the requested font.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    # Build exactly one TTFont per call; parsing the file is the costly part.
    if index is not None:
        return ttLib.TTFont(font_path, fontNumber=index)
    return ttLib.TTFont(font_path)


def get_best_cmap(font):
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
@@ -79,6 +96,52 @@ def get_best_cmap(font):
    return all_unicode_cmap.cmap if all_unicode_cmap else bmp_cmap.cmap


def get_variation_sequences_cmap(font):
    """Return the variation-sequences cmap subtable of a font.

    The subtable is recognised by its (format, platformID, platEncID) triple
    being (14, 0, 5).

    Args:
        font: The font descriptor accepted by ``open_font``.

    Returns:
        The matching cmap subtable, or ``None`` when the font has none.

    Raises:
        AssertionError: If the font carries more than one such subtable.
    """
    found = None
    for table in open_font(font)['cmap'].tables:
        if (table.format, table.platformID, table.platEncID) != (14, 0, 5):
            continue
        assert found is None, 'More than one VS cmap in %s' % (font, )
        found = table
    return found


def get_emoji_map(font):
    """Map every input the font handles to the glyph it produces.

    The returned dict has three kinds of keys:
      * single code points, copied from the font's best cmap;
      * ``(base, variation_selector)`` pairs from the variation-sequences
        cmap subtable;
      * tuples of code points, one per ligature found in the GSUB table.

    Args:
        font: The font descriptor accepted by ``open_font``.

    Returns:
        A dict from code point or code point tuple to glyph name.

    Raises:
        AssertionError: If a GSUB lookup is not of type 4, or if a ligature
            sequence (or any prefix of it of length >= 2) is already mapped.
    """
    # Work on a private copy of the plain character map.
    coverage = copy.copy(get_best_cmap(font))
    # Taken before any sequence is added, so it holds single characters only.
    glyph_to_char = {glyph: code for code, glyph in coverage.items()}

    # Variation sequences. An entry whose glyph is None reuses the glyph
    # already mapped for the base character.
    uvs_table = get_variation_sequences_cmap(font).uvsDict
    for selector, mappings in uvs_table.items():
        for base, glyph in mappings:
            key = (base, selector)
            coverage[key] = coverage[base] if glyph is None else glyph

    # GSUB ligature rules, translated from glyph names back to characters.
    gsub_lookups = open_font(font)['GSUB'].table.LookupList.Lookup
    for lookup in gsub_lookups:
        assert lookup.LookupType == 4, 'We only understand type 4 lookups'
        for subtable in lookup.SubTable:
            for first_glyph, ligature_set in subtable.ligatures.items():
                for ligature in ligature_set:
                    glyph_names = [first_glyph] + ligature.Component
                    sequence = tuple(
                        [glyph_to_char[name] for name in glyph_names])
                    # Neither the sequence nor any of its leading
                    # subsequences may have been registered already.
                    for prefix_len in range(2, len(sequence) + 1):
                        assert sequence[:prefix_len] not in coverage
                    coverage[sequence] = ligature.LigGlyph

    return coverage


def assert_font_supports_any_of_chars(font, chars):
    best_cmap = get_best_cmap(font)
    for char in chars:
@@ -101,6 +164,13 @@ def assert_font_supports_none_of_chars(font, chars):
            'U+%04X was found in %s' % (char, font))


def assert_font_supports_all_sequences(font, sequences):
    """Assert that a font lists every given variation sequence.

    A sequence counts as supported when its variation selector is a key of
    the font's ``uvsDict`` and ``(base, None)`` is among that selector's
    entries.

    Args:
        font: The font descriptor accepted by ``open_font``.
        sequences: Iterable of ``(base, variation_selector)`` pairs.

    Raises:
        AssertionError: On the first sequence, in sorted order, that is not
            listed that way.
    """
    uvs_table = get_variation_sequences_cmap(font).uvsDict
    for sequence in sorted(sequences):
        base, vs = sequence
        supported = vs in uvs_table and (base, None) in uvs_table[vs]
        assert supported, (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))


def check_hyphens(hyphens_dir):
    # Find all the scripts that need automatic hyphenation
    scripts = set()
@@ -119,6 +189,16 @@ def check_hyphens(hyphens_dir):
            assert_font_supports_any_of_chars(font, HYPHENS)


class FontRecord(object):
    """One entry of the font fallback chain.

    Attributes:
        name: Value given as ``name`` at construction.
        scripts: Collection of script codes; callers test membership such as
            ``'Zsye' in record.scripts``.
        variant: Value given as ``variant`` at construction.
        weight: Value given as ``weight`` at construction.
        style: Value given as ``style`` at construction.
        font: The font descriptor handed to the font-opening helpers.
    """

    def __init__(self, name, scripts, variant, weight, style, font):
        self.name = name
        self.scripts = scripts
        self.variant = variant
        self.weight = weight
        self.style = style
        self.font = font

    def __repr__(self):
        # Spell out every field so a record is identifiable when it shows up
        # in an assertion message or a debugger.
        return (
            '%s(name=%r, scripts=%r, variant=%r, weight=%r, style=%r, '
            'font=%r)' % (
                type(self).__name__,
                self.name,
                self.scripts,
                self.variant,
                self.weight,
                self.style,
                self.font))


def parse_fonts_xml(fonts_xml_path):
    global _script_to_font_map, _fallback_chain
    _script_to_font_map = collections.defaultdict(set)
@@ -159,7 +239,7 @@ def parse_fonts_xml(fonts_xml_path):
            if index:
                index = int(index)

            _fallback_chain.append((
            _fallback_chain.append(FontRecord(
                name,
                frozenset(scripts),
                variant,
@@ -175,39 +255,72 @@ def parse_fonts_xml(fonts_xml_path):
                _script_to_font_map[script].add((font_file, index))


def check_emoji_availability():
    emoji_fonts = [font[5] for font in _fallback_chain if 'Zsye' in font[1]]
def check_emoji_coverage(all_emoji, equivalent_emoji):
    emoji_fonts = [
        record.font for record in _fallback_chain
        if 'Zsye' in record.scripts]
    assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts)
    emoji_font = emoji_fonts[0]
    emoji_chars = _emoji_properties['Emoji']
    assert_font_supports_all_of_chars(emoji_font, emoji_chars)
    coverage = get_emoji_map(emoji_font)

    for sequence in all_emoji:
        assert sequence in coverage, (
            '%s is not supported in the emoji font.' % printable(sequence))

def check_emoji_defaults():
    default_emoji_chars = _emoji_properties['Emoji_Presentation']
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji_chars
    for sequence in coverage:
        if sequence in {0x0000, 0x000D, 0x0020}:
            # The font needs to support a few extra characters, which is OK
            continue
        assert sequence in all_emoji, (
            'Emoji font should not support %s.' % printable(sequence))

    for first, second in sorted(equivalent_emoji.items()):
        assert coverage[first] == coverage[second], (
            '%s and %s should map to the same glyph.' % (
                printable(first),
                printable(second)))

    for glyph in set(coverage.values()):
        maps_to_glyph = [seq for seq in coverage if coverage[seq] == glyph]
        if len(maps_to_glyph) > 1:
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            assert len(equivalent_seqs) == 1, (
                'The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))


def check_emoji_defaults(default_emoji):
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    emoji_font_seen = False
    for name, scripts, variant, weight, style, font in _fallback_chain:
        if 'Zsye' in scripts:
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_font_seen = True
            # No need to check the emoji font
            continue
        # For later fonts, we only check them if they have a script
        # defined, since the defined script may get them to a higher
        # score even if they appear after the emoji font.
        if emoji_font_seen and not scripts:
        if emoji_font_seen and not record.scripts:
            continue

        # Check default emoji-style characters
        assert_font_supports_none_of_chars(font, sorted(default_emoji_chars))
        assert_font_supports_none_of_chars(record.font, sorted(default_emoji))

        # Mark default text-style characters appearing in fonts above the emoji
        # font as seen
        if not emoji_font_seen:
            missing_text_chars -= set(get_best_cmap(font))
            missing_text_chars -= set(get_best_cmap(record.font))

    # Noto does not have monochrome symbols for Unicode 7.0 wingdings and
    # webdings
    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    # TODO: Remove these after b/26113320 is fixed
    missing_text_chars -= {
@@ -236,31 +349,175 @@ def parse_unicode_datafile(file_path, reverse=False):
            line = line.strip()
            if not line:
                continue
            char_range, prop = line.split(';')
            char_range = char_range.strip()

            chars, prop = line.split(';')
            chars = chars.strip()
            prop = prop.strip()
            if '..' in char_range:
                char_start, char_end = char_range.split('..')
            else:
                char_start = char_end = char_range

            if ' ' in chars:  # character sequence
                sequence = [int(ch, 16) for ch in chars.split(' ')]
                additions = [tuple(sequence)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                char_start = int(char_start, 16)
                char_end = int(char_end, 16)
            char_range = xrange(char_start, char_end+1)
                additions = xrange(char_start, char_end+1)
            else:  # singe character
                additions = [int(chars, 16)]
            if reverse:
                output_dict[prop].update(char_range)
                output_dict[prop].update(additions)
            else:
                for char in char_range:
                    assert char not in output_dict
                    output_dict[char] = prop
                for addition in additions:
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict


def parse_standardized_variants(file_path):
    """Collect text-style and emoji-style variation sequences from a file.

    Each line has its ``#`` comment removed and is skipped if nothing is
    left. The remainder must consist of exactly three ``;``-separated
    fields: space-separated hexadecimal code points (the first two are used
    as base and variation selector), a description, and a third field that
    is ignored.

    Args:
        file_path: Path of the data file to read.

    Returns:
        A ``(text_set, emoji_set)`` tuple of sets of ``(base, vs)`` pairs,
        holding the entries described as ``'text style'`` and
        ``'emoji style'`` respectively. Other descriptions are ignored.
    """
    text_set = set()
    emoji_set = set()
    set_for_style = {'text style': text_set, 'emoji style': emoji_set}

    with open(file_path) as datafile:
        for raw_line in datafile:
            # Keep only what precedes the first '#', if there is one.
            content = raw_line.partition('#')[0].strip()
            if not content:
                continue
            code_points, style, _ = content.split(';')
            code_points = code_points.strip().split(' ')
            base = int(code_points[0], 16)
            vs = int(code_points[1], 16)
            target = set_for_style.get(style.strip())
            if target is not None:
                target.add((base, vs))

    return text_set, emoji_set


def parse_ucd(ucd_path):
    """Load the Unicode data files from a directory into module globals.

    Populates ``_emoji_properties`` and ``_chars_by_age`` (parsed with
    ``reverse=True``), ``_text_variation_sequences`` and
    ``_emoji_variation_sequences`` (from StandardizedVariants.txt), and
    ``_emoji_sequences`` and ``_emoji_zwj_sequences``.

    Args:
        ucd_path: Directory containing emoji-data.txt, DerivedAge.txt,
            StandardizedVariants.txt, emoji-sequences.txt and
            emoji-zwj-sequences.txt.
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    def data_file(file_name):
        return path.join(ucd_path, file_name)

    _emoji_properties = parse_unicode_datafile(
        data_file('emoji-data.txt'), reverse=True)
    _chars_by_age = parse_unicode_datafile(
        data_file('DerivedAge.txt'), reverse=True)
    _text_variation_sequences, _emoji_variation_sequences = (
        parse_standardized_variants(data_file('StandardizedVariants.txt')))
    _emoji_sequences = parse_unicode_datafile(
        data_file('emoji-sequences.txt'))
    _emoji_zwj_sequences = parse_unicode_datafile(
        data_file('emoji-zwj-sequences.txt'))


def flag_sequence(territory_code):
    """Return the code point tuple for a territory code's flag sequence.

    Each letter is shifted so that 'A' lands on 0x1F1E6, 'B' on 0x1F1E7, and
    so on.

    Args:
        territory_code: Iterable of single characters, e.g. the string 'US'.

    Returns:
        A tuple with one code point per character of ``territory_code``.
    """
    shift = 0x1F1E6 - ord('A')
    return tuple([ord(letter) + shift for letter in territory_code])


# Flag sequences that compute_expected_emoji() always counts among the
# "tofu" flags, i.e. the ones expected to share a single glyph.
UNSUPPORTED_FLAGS = frozenset(
    flag_sequence(code) for code in (
        'BL', 'BQ', 'DG',
        'EA', 'EH', 'FK',
        'GF', 'GP', 'GS',
        'MF', 'MQ', 'NC',
        'PM', 'RE', 'TF',
        'WF', 'XK', 'YT',
    ))

# Flag sequences (keys) that are expected to map to the same glyph as
# another flag sequence (values).
EQUIVALENT_FLAGS = {
    flag_sequence(alias): flag_sequence(target)
    for alias, target in (
        ('BV', 'NO'),
        ('CP', 'FR'),
        ('HM', 'AU'),
        ('SJ', 'NO'),
        ('UM', 'US'),
    )
}

# Code point appended to a base character to build the keycap sequences
# listed below.
COMBINING_KEYCAP = 0x20E3

# Legacy private-use code points (keys) and the flag or keycap sequence
# (values) each one is treated as equivalent to: compute_expected_emoji()
# adds the keys to the expected emoji and merges the whole table into the
# equivalence map.
LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}

# Sequences joined with U+200D (keys) that are expected to map to the same
# glyph as a single code point (values). compute_expected_emoji() merges
# this table into the equivalence map.
ZWJ_IDENTICALS = {
    # KISS
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
    # COUPLE WITH HEART
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468): 0x1F491,
    # FAMILY
    (0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466): 0x1F46A,
}

def compute_expected_emoji():
    """Derive the expected emoji coverage from the parsed Unicode data.

    Reads the module-level tables filled in by ``parse_ucd`` together with
    the flag, legacy and ZWJ constants of this module.

    Returns:
        A tuple ``(all_emoji, default_emoji, equivalent_emoji)``:
          * ``all_emoji``: set of the characters with the ``Emoji`` property,
            all expected sequences, the individual characters those
            sequences are built from, and the legacy code points.
          * ``default_emoji``: set of the characters with the
            ``Emoji_Presentation`` property, all expected sequences, and the
            legacy code points.
          * ``equivalent_emoji``: dict mapping a character or sequence to
            another one that must resolve to the same glyph. No entry maps
            a key to itself.
    """
    def without_emoji_vs(sequence):
        # Sequences are compared with the emoji variation selector removed.
        return tuple(ch for ch in sequence if ch != EMOJI_VS)

    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set(_emoji_variation_sequences)

    for sequence in _emoji_sequences:
        sequence = without_emoji_vs(sequence)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)

    for sequence in _emoji_zwj_sequences:
        sequence = without_emoji_vs(sequence)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        # Add reverse of all emoji ZWJ sequences, which are added to the fonts
        # as a workaround to get the sequences work in RTL text.
        reversed_seq = tuple(reversed(sequence))
        all_sequences.add(reversed_seq)
        # A palindromic sequence is its own reverse. Recording it as
        # equivalent to itself would put a self-loop into the map, which
        # anything that follows equivalence chains could never leave.
        if reversed_seq != sequence:
            equivalent_emoji[reversed_seq] = sequence

    # Add all two-letter flag sequences, as even the unsupported ones should
    # resolve to a flag tofu.
    all_letters = [chr(code) for code in range(ord('A'), ord('Z')+1)]
    all_two_letter_codes = itertools.product(all_letters, repeat=2)
    all_flags = {flag_sequence(code) for code in all_two_letter_codes}
    all_sequences.update(all_flags)
    tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences))

    legacy_chars = set(LEGACY_ANDROID_EMOJI)
    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        legacy_chars)
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        all_sequences |
        legacy_chars)

    # Every tofu flag is declared equivalent to one representative, the
    # smallest flag sequence.
    first_tofu_flag = min(tofu_flags)
    for flag in tofu_flags:
        if flag != first_tofu_flag:
            equivalent_emoji[flag] = first_tofu_flag
    # The order of these updates matters: later tables override earlier
    # entries for the same key.
    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji


def main():
@@ -278,8 +535,9 @@ def main():
    if check_emoji == 'true':
        ucd_path = sys.argv[3]
        parse_ucd(ucd_path)
        check_emoji_availability()
        check_emoji_defaults()
        all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
        check_emoji_coverage(all_emoji, equivalent_emoji)
        check_emoji_defaults(default_emoji)


if __name__ == '__main__':