Loading tools/fonts/fontchain_lint.py +291 −33 Original line number Original line Diff line number Diff line #!/usr/bin/env python #!/usr/bin/env python import collections import collections import copy import glob import glob import itertools from os import path from os import path import sys import sys from xml.etree import ElementTree from xml.etree import ElementTree from fontTools import ttLib from fontTools import ttLib EMOJI_VS = 0xFE0F LANG_TO_SCRIPT = { LANG_TO_SCRIPT = { 'as': 'Beng', 'as': 'Beng', 'bn': 'Beng', 'bn': 'Beng', Loading Loading @@ -57,13 +61,26 @@ def lang_to_script(lang_code): return LANG_TO_SCRIPT[lang] return LANG_TO_SCRIPT[lang] def get_best_cmap(font): def printable(inp): if type(inp) is set: # set of character sequences return '{' + ', '.join([printable(seq) for seq in inp]) + '}' if type(inp) is tuple: # character sequence return '<' + (', '.join([printable(ch) for ch in inp])) + '>' else: # single character return 'U+%04X' % inp def open_font(font): font_file, index = font font_file, index = font font_path = path.join(_fonts_dir, font_file) font_path = path.join(_fonts_dir, font_file) if index is not None: if index is not None: ttfont = ttLib.TTFont(font_path, fontNumber=index) return ttLib.TTFont(font_path, fontNumber=index) else: else: ttfont = ttLib.TTFont(font_path) return ttLib.TTFont(font_path) def get_best_cmap(font): ttfont = open_font(font) all_unicode_cmap = None all_unicode_cmap = None bmp_cmap = None bmp_cmap = None for cmap in ttfont['cmap'].tables: for cmap in ttfont['cmap'].tables: Loading @@ -79,6 +96,52 @@ def get_best_cmap(font): return all_unicode_cmap.cmap if all_unicode_cmap else bmp_cmap.cmap return all_unicode_cmap.cmap if all_unicode_cmap else bmp_cmap.cmap def get_variation_sequences_cmap(font): ttfont = open_font(font) vs_cmap = None for cmap in ttfont['cmap'].tables: specifier = (cmap.format, cmap.platformID, cmap.platEncID) if specifier == (14, 0, 5): assert vs_cmap is None, 'More than one VS cmap in %s' % (font, ) vs_cmap = cmap return vs_cmap def get_emoji_map(font): # Add normal characters emoji_map = copy.copy(get_best_cmap(font)) reverse_cmap = {glyph: code for code, glyph in emoji_map.items()} # Add variation sequences vs_dict = get_variation_sequences_cmap(font).uvsDict for vs in vs_dict: for base, glyph in vs_dict[vs]: if glyph is None: emoji_map[(base, vs)] = emoji_map[base] else: emoji_map[(base, vs)] = glyph # Add GSUB rules ttfont = open_font(font) for lookup in ttfont['GSUB'].table.LookupList.Lookup: assert lookup.LookupType == 4, 'We only understand type 4 lookups' for subtable in lookup.SubTable: ligatures = subtable.ligatures for first_glyph in ligatures: for ligature in ligatures[first_glyph]: sequence = [first_glyph] + ligature.Component sequence = [reverse_cmap[glyph] for glyph in sequence] sequence = tuple(sequence) # Make sure no starting subsequence of 'sequence' has been # seen before. for sub_len in range(2, len(sequence)+1): subsequence = sequence[:sub_len] assert subsequence not in emoji_map emoji_map[sequence] = ligature.LigGlyph return emoji_map def assert_font_supports_any_of_chars(font, chars): def assert_font_supports_any_of_chars(font, chars): best_cmap = get_best_cmap(font) best_cmap = get_best_cmap(font) for char in chars: for char in chars: Loading @@ -101,6 +164,13 @@ def assert_font_supports_none_of_chars(font, chars): 'U+%04X was found in %s' % (char, font)) 'U+%04X was found in %s' % (char, font)) def assert_font_supports_all_sequences(font, sequences): vs_dict = get_variation_sequences_cmap(font).uvsDict for base, vs in sorted(sequences): assert vs in vs_dict and (base, None) in vs_dict[vs], ( '<U+%04X, U+%04X> was not found in %s' % (base, vs, font)) def check_hyphens(hyphens_dir): def check_hyphens(hyphens_dir): # Find all the scripts that need automatic hyphenation # Find all the scripts that need automatic hyphenation scripts = set() scripts = set() Loading @@ -119,6 +189,16 @@ def check_hyphens(hyphens_dir): assert_font_supports_any_of_chars(font, HYPHENS) assert_font_supports_any_of_chars(font, HYPHENS) class FontRecord(object): def __init__(self, name, scripts, variant, weight, style, font): self.name = name self.scripts = scripts self.variant = variant self.weight = weight self.style = style self.font = font def parse_fonts_xml(fonts_xml_path): def parse_fonts_xml(fonts_xml_path): global _script_to_font_map, _fallback_chain global _script_to_font_map, _fallback_chain _script_to_font_map = collections.defaultdict(set) _script_to_font_map = collections.defaultdict(set) Loading Loading @@ -159,7 +239,7 @@ def parse_fonts_xml(fonts_xml_path): if index: if index: index = int(index) index = int(index) _fallback_chain.append(( _fallback_chain.append(FontRecord( name, name, frozenset(scripts), frozenset(scripts), variant, variant, Loading @@ -175,39 +255,72 @@ def parse_fonts_xml(fonts_xml_path): _script_to_font_map[script].add((font_file, index)) _script_to_font_map[script].add((font_file, index)) def check_emoji_availability(): def check_emoji_coverage(all_emoji, equivalent_emoji): emoji_fonts = [font[5] for font in _fallback_chain if 'Zsye' in font[1]] emoji_fonts = [ record.font for record in _fallback_chain if 'Zsye' in record.scripts] assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts) assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts) emoji_font = emoji_fonts[0] emoji_font = emoji_fonts[0] emoji_chars = _emoji_properties['Emoji'] coverage = get_emoji_map(emoji_font) assert_font_supports_all_of_chars(emoji_font, emoji_chars) for sequence in all_emoji: assert sequence in coverage, ( '%s is not supported in the emoji font.' % printable(sequence)) def check_emoji_defaults(): for sequence in coverage: default_emoji_chars = _emoji_properties['Emoji_Presentation'] if sequence in {0x0000, 0x000D, 0x0020}: missing_text_chars = _emoji_properties['Emoji'] - default_emoji_chars # The font needs to support a few extra characters, which is OK continue assert sequence in all_emoji, ( 'Emoji font should not support %s.' % printable(sequence)) for first, second in sorted(equivalent_emoji.items()): assert coverage[first] == coverage[second], ( '%s and %s should map to the same glyph.' % ( printable(first), printable(second))) for glyph in set(coverage.values()): maps_to_glyph = [seq for seq in coverage if coverage[seq] == glyph] if len(maps_to_glyph) > 1: # There are more than one sequences mapping to the same glyph. We # need to make sure they were expected to be equivalent. equivalent_seqs = set() for seq in maps_to_glyph: equivalent_seq = seq while equivalent_seq in equivalent_emoji: equivalent_seq = equivalent_emoji[equivalent_seq] equivalent_seqs.add(equivalent_seq) assert len(equivalent_seqs) == 1, ( 'The sequences %s should not result in the same glyph %s' % ( printable(equivalent_seqs), glyph)) def check_emoji_defaults(default_emoji): missing_text_chars = _emoji_properties['Emoji'] - default_emoji emoji_font_seen = False emoji_font_seen = False for name, scripts, variant, weight, style, font in _fallback_chain: for record in _fallback_chain: if 'Zsye' in scripts: if 'Zsye' in record.scripts: emoji_font_seen = True emoji_font_seen = True # No need to check the emoji font # No need to check the emoji font continue continue # For later fonts, we only check them if they have a script # For later fonts, we only check them if they have a script # defined, since the defined script may get them to a higher # defined, since the defined script may get them to a higher # score even if they appear after the emoji font. # score even if they appear after the emoji font. if emoji_font_seen and not scripts: if emoji_font_seen and not record.scripts: continue continue # Check default emoji-style characters # Check default emoji-style characters assert_font_supports_none_of_chars(font, sorted(default_emoji_chars)) assert_font_supports_none_of_chars(record.font, sorted(default_emoji)) # Mark default text-style characters appearing in fonts above the emoji # Mark default text-style characters appearing in fonts above the emoji # font as seen # font as seen if not emoji_font_seen: if not emoji_font_seen: missing_text_chars -= set(get_best_cmap(font)) missing_text_chars -= set(get_best_cmap(record.font)) # Noto does not have monochrome symbols for Unicode 7.0 wingdings and # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and # webdings # webdings yet. missing_text_chars -= _chars_by_age['7.0'] missing_text_chars -= _chars_by_age['7.0'] # TODO: Remove these after b/26113320 is fixed # TODO: Remove these after b/26113320 is fixed missing_text_chars -= { missing_text_chars -= { Loading Loading @@ -236,31 +349,175 @@ def parse_unicode_datafile(file_path, reverse=False): line = line.strip() line = line.strip() if not line: if not line: continue continue char_range, prop = line.split(';') char_range = char_range.strip() chars, prop = line.split(';') chars = chars.strip() prop = prop.strip() prop = prop.strip() if '..' in char_range: char_start, char_end = char_range.split('..') if ' ' in chars: # character sequence else: sequence = [int(ch, 16) for ch in chars.split(' ')] char_start = char_end = char_range additions = [tuple(sequence)] elif '..' in chars: # character range char_start, char_end = chars.split('..') char_start = int(char_start, 16) char_start = int(char_start, 16) char_end = int(char_end, 16) char_end = int(char_end, 16) char_range = xrange(char_start, char_end+1) additions = xrange(char_start, char_end+1) else: # singe character additions = [int(chars, 16)] if reverse: if reverse: output_dict[prop].update(char_range) output_dict[prop].update(additions) else: else: for char in char_range: for addition in additions: assert char not in output_dict assert addition not in output_dict output_dict[char] = prop output_dict[addition] = prop return output_dict return output_dict def parse_standardized_variants(file_path): emoji_set = set() text_set = set() with open(file_path) as datafile: for line in datafile: if '#' in line: line = line[:line.index('#')] line = line.strip() if not line: continue sequence, description, _ = line.split(';') sequence = sequence.strip().split(' ') base = int(sequence[0], 16) vs = int(sequence[1], 16) description = description.strip() if description == 'text style': text_set.add((base, vs)) elif description == 'emoji style': emoji_set.add((base, vs)) return text_set, emoji_set def parse_ucd(ucd_path): def parse_ucd(ucd_path): global _emoji_properties, _chars_by_age global _emoji_properties, _chars_by_age global _text_variation_sequences, _emoji_variation_sequences global _emoji_sequences, _emoji_zwj_sequences _emoji_properties = parse_unicode_datafile( _emoji_properties = parse_unicode_datafile( path.join(ucd_path, 'emoji-data.txt'), reverse=True) path.join(ucd_path, 'emoji-data.txt'), reverse=True) _chars_by_age = parse_unicode_datafile( _chars_by_age = parse_unicode_datafile( path.join(ucd_path, 'DerivedAge.txt'), reverse=True) path.join(ucd_path, 'DerivedAge.txt'), reverse=True) sequences = parse_standardized_variants( path.join(ucd_path, 'StandardizedVariants.txt')) _text_variation_sequences, _emoji_variation_sequences = sequences _emoji_sequences = parse_unicode_datafile( path.join(ucd_path, 'emoji-sequences.txt')) _emoji_zwj_sequences = parse_unicode_datafile( path.join(ucd_path, 'emoji-zwj-sequences.txt')) def flag_sequence(territory_code): return tuple(0x1F1E6 + ord(ch) - ord('A') for ch in territory_code) UNSUPPORTED_FLAGS = frozenset({ flag_sequence('BL'), flag_sequence('BQ'), flag_sequence('DG'), flag_sequence('EA'), flag_sequence('EH'), flag_sequence('FK'), flag_sequence('GF'), flag_sequence('GP'), flag_sequence('GS'), flag_sequence('MF'), flag_sequence('MQ'), flag_sequence('NC'), flag_sequence('PM'), flag_sequence('RE'), flag_sequence('TF'), flag_sequence('WF'), flag_sequence('XK'), flag_sequence('YT'), }) EQUIVALENT_FLAGS = { flag_sequence('BV'): flag_sequence('NO'), flag_sequence('CP'): flag_sequence('FR'), flag_sequence('HM'): flag_sequence('AU'), flag_sequence('SJ'): flag_sequence('NO'), flag_sequence('UM'): flag_sequence('US'), } COMBINING_KEYCAP = 0x20E3 LEGACY_ANDROID_EMOJI = { 0xFE4E5: flag_sequence('JP'), 0xFE4E6: flag_sequence('US'), 0xFE4E7: flag_sequence('FR'), 0xFE4E8: flag_sequence('DE'), 0xFE4E9: flag_sequence('IT'), 0xFE4EA: flag_sequence('GB'), 0xFE4EB: flag_sequence('ES'), 0xFE4EC: flag_sequence('RU'), 0xFE4ED: flag_sequence('CN'), 0xFE4EE: flag_sequence('KR'), 0xFE82C: (ord('#'), COMBINING_KEYCAP), 0xFE82E: (ord('1'), COMBINING_KEYCAP), 0xFE82F: (ord('2'), COMBINING_KEYCAP), 0xFE830: (ord('3'), COMBINING_KEYCAP), 0xFE831: (ord('4'), COMBINING_KEYCAP), 0xFE832: (ord('5'), COMBINING_KEYCAP), 0xFE833: (ord('6'), COMBINING_KEYCAP), 0xFE834: (ord('7'), COMBINING_KEYCAP), 0xFE835: (ord('8'), COMBINING_KEYCAP), 0xFE836: (ord('9'), COMBINING_KEYCAP), 0xFE837: (ord('0'), COMBINING_KEYCAP), } ZWJ_IDENTICALS = { # KISS (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F, # COUPLE WITH HEART (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468): 0x1F491, # FAMILY (0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466): 0x1F46A, } def compute_expected_emoji(): equivalent_emoji = {} sequence_pieces = set() all_sequences = set() all_sequences.update(_emoji_variation_sequences) for sequence in _emoji_sequences.keys(): sequence = tuple(ch for ch in sequence if ch != EMOJI_VS) all_sequences.add(sequence) sequence_pieces.update(sequence) for sequence in _emoji_zwj_sequences.keys(): sequence = tuple(ch for ch in sequence if ch != EMOJI_VS) all_sequences.add(sequence) sequence_pieces.update(sequence) # Add reverse of all emoji ZWJ sequences, which are added to the fonts # as a workaround to get the sequences work in RTL text. reversed_seq = tuple(reversed(sequence)) all_sequences.add(reversed_seq) equivalent_emoji[reversed_seq] = sequence # Add all two-letter flag sequences, as even the unsupported ones should # resolve to a flag tofu. all_letters = [chr(code) for code in range(ord('A'), ord('Z')+1)] all_two_letter_codes = itertools.product(all_letters, repeat=2) all_flags = {flag_sequence(code) for code in all_two_letter_codes} all_sequences.update(all_flags) tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences.keys())) all_emoji = ( _emoji_properties['Emoji'] | all_sequences | sequence_pieces | set(LEGACY_ANDROID_EMOJI.keys())) default_emoji = ( _emoji_properties['Emoji_Presentation'] | all_sequences | set(LEGACY_ANDROID_EMOJI.keys())) first_tofu_flag = sorted(tofu_flags)[0] for flag in tofu_flags: if flag != first_tofu_flag: equivalent_emoji[flag] = first_tofu_flag equivalent_emoji.update(EQUIVALENT_FLAGS) equivalent_emoji.update(LEGACY_ANDROID_EMOJI) equivalent_emoji.update(ZWJ_IDENTICALS) for seq in _emoji_variation_sequences: equivalent_emoji[seq] = seq[0] return all_emoji, default_emoji, equivalent_emoji def main(): def main(): Loading @@ -278,8 +535,9 @@ def main(): if check_emoji == 'true': if check_emoji == 'true': ucd_path = sys.argv[3] ucd_path = sys.argv[3] parse_ucd(ucd_path) parse_ucd(ucd_path) check_emoji_availability() all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji() check_emoji_defaults() check_emoji_coverage(all_emoji, equivalent_emoji) check_emoji_defaults(default_emoji) if __name__ == '__main__': if __name__ == '__main__': Loading Loading
tools/fonts/fontchain_lint.py +291 −33 Original line number Original line Diff line number Diff line #!/usr/bin/env python #!/usr/bin/env python import collections import collections import copy import glob import glob import itertools from os import path from os import path import sys import sys from xml.etree import ElementTree from xml.etree import ElementTree from fontTools import ttLib from fontTools import ttLib EMOJI_VS = 0xFE0F LANG_TO_SCRIPT = { LANG_TO_SCRIPT = { 'as': 'Beng', 'as': 'Beng', 'bn': 'Beng', 'bn': 'Beng', Loading Loading @@ -57,13 +61,26 @@ def lang_to_script(lang_code): return LANG_TO_SCRIPT[lang] return LANG_TO_SCRIPT[lang] def get_best_cmap(font): def printable(inp): if type(inp) is set: # set of character sequences return '{' + ', '.join([printable(seq) for seq in inp]) + '}' if type(inp) is tuple: # character sequence return '<' + (', '.join([printable(ch) for ch in inp])) + '>' else: # single character return 'U+%04X' % inp def open_font(font): font_file, index = font font_file, index = font font_path = path.join(_fonts_dir, font_file) font_path = path.join(_fonts_dir, font_file) if index is not None: if index is not None: ttfont = ttLib.TTFont(font_path, fontNumber=index) return ttLib.TTFont(font_path, fontNumber=index) else: else: ttfont = ttLib.TTFont(font_path) return ttLib.TTFont(font_path) def get_best_cmap(font): ttfont = open_font(font) all_unicode_cmap = None all_unicode_cmap = None bmp_cmap = None bmp_cmap = None for cmap in ttfont['cmap'].tables: for cmap in ttfont['cmap'].tables: Loading @@ -79,6 +96,52 @@ def get_best_cmap(font): return all_unicode_cmap.cmap if all_unicode_cmap else bmp_cmap.cmap return all_unicode_cmap.cmap if all_unicode_cmap else bmp_cmap.cmap def get_variation_sequences_cmap(font): ttfont = open_font(font) vs_cmap = None for cmap in ttfont['cmap'].tables: specifier = (cmap.format, cmap.platformID, cmap.platEncID) if specifier == (14, 0, 5): assert vs_cmap is None, 'More than one VS cmap in %s' % (font, ) vs_cmap = cmap return vs_cmap def get_emoji_map(font): # Add normal characters emoji_map = copy.copy(get_best_cmap(font)) reverse_cmap = {glyph: code for code, glyph in emoji_map.items()} # Add variation sequences vs_dict = get_variation_sequences_cmap(font).uvsDict for vs in vs_dict: for base, glyph in vs_dict[vs]: if glyph is None: emoji_map[(base, vs)] = emoji_map[base] else: emoji_map[(base, vs)] = glyph # Add GSUB rules ttfont = open_font(font) for lookup in ttfont['GSUB'].table.LookupList.Lookup: assert lookup.LookupType == 4, 'We only understand type 4 lookups' for subtable in lookup.SubTable: ligatures = subtable.ligatures for first_glyph in ligatures: for ligature in ligatures[first_glyph]: sequence = [first_glyph] + ligature.Component sequence = [reverse_cmap[glyph] for glyph in sequence] sequence = tuple(sequence) # Make sure no starting subsequence of 'sequence' has been # seen before. for sub_len in range(2, len(sequence)+1): subsequence = sequence[:sub_len] assert subsequence not in emoji_map emoji_map[sequence] = ligature.LigGlyph return emoji_map def assert_font_supports_any_of_chars(font, chars): def assert_font_supports_any_of_chars(font, chars): best_cmap = get_best_cmap(font) best_cmap = get_best_cmap(font) for char in chars: for char in chars: Loading @@ -101,6 +164,13 @@ def assert_font_supports_none_of_chars(font, chars): 'U+%04X was found in %s' % (char, font)) 'U+%04X was found in %s' % (char, font)) def assert_font_supports_all_sequences(font, sequences): vs_dict = get_variation_sequences_cmap(font).uvsDict for base, vs in sorted(sequences): assert vs in vs_dict and (base, None) in vs_dict[vs], ( '<U+%04X, U+%04X> was not found in %s' % (base, vs, font)) def check_hyphens(hyphens_dir): def check_hyphens(hyphens_dir): # Find all the scripts that need automatic hyphenation # Find all the scripts that need automatic hyphenation scripts = set() scripts = set() Loading @@ -119,6 +189,16 @@ def check_hyphens(hyphens_dir): assert_font_supports_any_of_chars(font, HYPHENS) assert_font_supports_any_of_chars(font, HYPHENS) class FontRecord(object): def __init__(self, name, scripts, variant, weight, style, font): self.name = name self.scripts = scripts self.variant = variant self.weight = weight self.style = style self.font = font def parse_fonts_xml(fonts_xml_path): def parse_fonts_xml(fonts_xml_path): global _script_to_font_map, _fallback_chain global _script_to_font_map, _fallback_chain _script_to_font_map = collections.defaultdict(set) _script_to_font_map = collections.defaultdict(set) Loading Loading @@ -159,7 +239,7 @@ def parse_fonts_xml(fonts_xml_path): if index: if index: index = int(index) index = int(index) _fallback_chain.append(( _fallback_chain.append(FontRecord( name, name, frozenset(scripts), frozenset(scripts), variant, variant, Loading @@ -175,39 +255,72 @@ def parse_fonts_xml(fonts_xml_path): _script_to_font_map[script].add((font_file, index)) _script_to_font_map[script].add((font_file, index)) def check_emoji_availability(): def check_emoji_coverage(all_emoji, equivalent_emoji): emoji_fonts = [font[5] for font in _fallback_chain if 'Zsye' in font[1]] emoji_fonts = [ record.font for record in _fallback_chain if 'Zsye' in record.scripts] assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts) assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts) emoji_font = emoji_fonts[0] emoji_font = emoji_fonts[0] emoji_chars = _emoji_properties['Emoji'] coverage = get_emoji_map(emoji_font) assert_font_supports_all_of_chars(emoji_font, emoji_chars) for sequence in all_emoji: assert sequence in coverage, ( '%s is not supported in the emoji font.' % printable(sequence)) def check_emoji_defaults(): for sequence in coverage: default_emoji_chars = _emoji_properties['Emoji_Presentation'] if sequence in {0x0000, 0x000D, 0x0020}: missing_text_chars = _emoji_properties['Emoji'] - default_emoji_chars # The font needs to support a few extra characters, which is OK continue assert sequence in all_emoji, ( 'Emoji font should not support %s.' % printable(sequence)) for first, second in sorted(equivalent_emoji.items()): assert coverage[first] == coverage[second], ( '%s and %s should map to the same glyph.' % ( printable(first), printable(second))) for glyph in set(coverage.values()): maps_to_glyph = [seq for seq in coverage if coverage[seq] == glyph] if len(maps_to_glyph) > 1: # There are more than one sequences mapping to the same glyph. We # need to make sure they were expected to be equivalent. equivalent_seqs = set() for seq in maps_to_glyph: equivalent_seq = seq while equivalent_seq in equivalent_emoji: equivalent_seq = equivalent_emoji[equivalent_seq] equivalent_seqs.add(equivalent_seq) assert len(equivalent_seqs) == 1, ( 'The sequences %s should not result in the same glyph %s' % ( printable(equivalent_seqs), glyph)) def check_emoji_defaults(default_emoji): missing_text_chars = _emoji_properties['Emoji'] - default_emoji emoji_font_seen = False emoji_font_seen = False for name, scripts, variant, weight, style, font in _fallback_chain: for record in _fallback_chain: if 'Zsye' in scripts: if 'Zsye' in record.scripts: emoji_font_seen = True emoji_font_seen = True # No need to check the emoji font # No need to check the emoji font continue continue # For later fonts, we only check them if they have a script # For later fonts, we only check them if they have a script # defined, since the defined script may get them to a higher # defined, since the defined script may get them to a higher # score even if they appear after the emoji font. # score even if they appear after the emoji font. if emoji_font_seen and not scripts: if emoji_font_seen and not record.scripts: continue continue # Check default emoji-style characters # Check default emoji-style characters assert_font_supports_none_of_chars(font, sorted(default_emoji_chars)) assert_font_supports_none_of_chars(record.font, sorted(default_emoji)) # Mark default text-style characters appearing in fonts above the emoji # Mark default text-style characters appearing in fonts above the emoji # font as seen # font as seen if not emoji_font_seen: if not emoji_font_seen: missing_text_chars -= set(get_best_cmap(font)) missing_text_chars -= set(get_best_cmap(record.font)) # Noto does not have monochrome symbols for Unicode 7.0 wingdings and # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and # webdings # webdings yet. missing_text_chars -= _chars_by_age['7.0'] missing_text_chars -= _chars_by_age['7.0'] # TODO: Remove these after b/26113320 is fixed # TODO: Remove these after b/26113320 is fixed missing_text_chars -= { missing_text_chars -= { Loading Loading @@ -236,31 +349,175 @@ def parse_unicode_datafile(file_path, reverse=False): line = line.strip() line = line.strip() if not line: if not line: continue continue char_range, prop = line.split(';') char_range = char_range.strip() chars, prop = line.split(';') chars = chars.strip() prop = prop.strip() prop = prop.strip() if '..' in char_range: char_start, char_end = char_range.split('..') if ' ' in chars: # character sequence else: sequence = [int(ch, 16) for ch in chars.split(' ')] char_start = char_end = char_range additions = [tuple(sequence)] elif '..' in chars: # character range char_start, char_end = chars.split('..') char_start = int(char_start, 16) char_start = int(char_start, 16) char_end = int(char_end, 16) char_end = int(char_end, 16) char_range = xrange(char_start, char_end+1) additions = xrange(char_start, char_end+1) else: # singe character additions = [int(chars, 16)] if reverse: if reverse: output_dict[prop].update(char_range) output_dict[prop].update(additions) else: else: for char in char_range: for addition in additions: assert char not in output_dict assert addition not in output_dict output_dict[char] = prop output_dict[addition] = prop return output_dict return output_dict def parse_standardized_variants(file_path): emoji_set = set() text_set = set() with open(file_path) as datafile: for line in datafile: if '#' in line: line = line[:line.index('#')] line = line.strip() if not line: continue sequence, description, _ = line.split(';') sequence = sequence.strip().split(' ') base = int(sequence[0], 16) vs = int(sequence[1], 16) description = description.strip() if description == 'text style': text_set.add((base, vs)) elif description == 'emoji style': emoji_set.add((base, vs)) return text_set, emoji_set def parse_ucd(ucd_path): def parse_ucd(ucd_path): global _emoji_properties, _chars_by_age global _emoji_properties, _chars_by_age global _text_variation_sequences, _emoji_variation_sequences global _emoji_sequences, _emoji_zwj_sequences _emoji_properties = parse_unicode_datafile( _emoji_properties = parse_unicode_datafile( path.join(ucd_path, 'emoji-data.txt'), reverse=True) path.join(ucd_path, 'emoji-data.txt'), reverse=True) _chars_by_age = parse_unicode_datafile( _chars_by_age = parse_unicode_datafile( path.join(ucd_path, 'DerivedAge.txt'), reverse=True) path.join(ucd_path, 'DerivedAge.txt'), reverse=True) sequences = parse_standardized_variants( path.join(ucd_path, 'StandardizedVariants.txt')) _text_variation_sequences, _emoji_variation_sequences = sequences _emoji_sequences = parse_unicode_datafile( path.join(ucd_path, 'emoji-sequences.txt')) _emoji_zwj_sequences = parse_unicode_datafile( path.join(ucd_path, 'emoji-zwj-sequences.txt')) def flag_sequence(territory_code): return tuple(0x1F1E6 + ord(ch) - ord('A') for ch in territory_code) UNSUPPORTED_FLAGS = frozenset({ flag_sequence('BL'), flag_sequence('BQ'), flag_sequence('DG'), flag_sequence('EA'), flag_sequence('EH'), flag_sequence('FK'), flag_sequence('GF'), flag_sequence('GP'), flag_sequence('GS'), flag_sequence('MF'), flag_sequence('MQ'), flag_sequence('NC'), flag_sequence('PM'), flag_sequence('RE'), flag_sequence('TF'), flag_sequence('WF'), flag_sequence('XK'), flag_sequence('YT'), }) EQUIVALENT_FLAGS = { flag_sequence('BV'): flag_sequence('NO'), flag_sequence('CP'): flag_sequence('FR'), flag_sequence('HM'): flag_sequence('AU'), flag_sequence('SJ'): flag_sequence('NO'), flag_sequence('UM'): flag_sequence('US'), } COMBINING_KEYCAP = 0x20E3 LEGACY_ANDROID_EMOJI = { 0xFE4E5: flag_sequence('JP'), 0xFE4E6: flag_sequence('US'), 0xFE4E7: flag_sequence('FR'), 0xFE4E8: flag_sequence('DE'), 0xFE4E9: flag_sequence('IT'), 0xFE4EA: flag_sequence('GB'), 0xFE4EB: flag_sequence('ES'), 0xFE4EC: flag_sequence('RU'), 0xFE4ED: flag_sequence('CN'), 0xFE4EE: flag_sequence('KR'), 0xFE82C: (ord('#'), COMBINING_KEYCAP), 0xFE82E: (ord('1'), COMBINING_KEYCAP), 0xFE82F: (ord('2'), COMBINING_KEYCAP), 0xFE830: (ord('3'), COMBINING_KEYCAP), 0xFE831: (ord('4'), COMBINING_KEYCAP), 0xFE832: (ord('5'), COMBINING_KEYCAP), 0xFE833: (ord('6'), COMBINING_KEYCAP), 0xFE834: (ord('7'), COMBINING_KEYCAP), 0xFE835: (ord('8'), COMBINING_KEYCAP), 0xFE836: (ord('9'), COMBINING_KEYCAP), 0xFE837: (ord('0'), COMBINING_KEYCAP), } ZWJ_IDENTICALS = { # KISS (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F, # COUPLE WITH HEART (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468): 0x1F491, # FAMILY (0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466): 0x1F46A, } def compute_expected_emoji(): equivalent_emoji = {} sequence_pieces = set() all_sequences = set() all_sequences.update(_emoji_variation_sequences) for sequence in _emoji_sequences.keys(): sequence = tuple(ch for ch in sequence if ch != EMOJI_VS) all_sequences.add(sequence) sequence_pieces.update(sequence) for sequence in _emoji_zwj_sequences.keys(): sequence = tuple(ch for ch in sequence if ch != EMOJI_VS) all_sequences.add(sequence) sequence_pieces.update(sequence) # Add reverse of all emoji ZWJ sequences, which are added to the fonts # as a workaround to get the sequences work in RTL text. reversed_seq = tuple(reversed(sequence)) all_sequences.add(reversed_seq) equivalent_emoji[reversed_seq] = sequence # Add all two-letter flag sequences, as even the unsupported ones should # resolve to a flag tofu. all_letters = [chr(code) for code in range(ord('A'), ord('Z')+1)] all_two_letter_codes = itertools.product(all_letters, repeat=2) all_flags = {flag_sequence(code) for code in all_two_letter_codes} all_sequences.update(all_flags) tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences.keys())) all_emoji = ( _emoji_properties['Emoji'] | all_sequences | sequence_pieces | set(LEGACY_ANDROID_EMOJI.keys())) default_emoji = ( _emoji_properties['Emoji_Presentation'] | all_sequences | set(LEGACY_ANDROID_EMOJI.keys())) first_tofu_flag = sorted(tofu_flags)[0] for flag in tofu_flags: if flag != first_tofu_flag: equivalent_emoji[flag] = first_tofu_flag equivalent_emoji.update(EQUIVALENT_FLAGS) equivalent_emoji.update(LEGACY_ANDROID_EMOJI) equivalent_emoji.update(ZWJ_IDENTICALS) for seq in _emoji_variation_sequences: equivalent_emoji[seq] = seq[0] return all_emoji, default_emoji, equivalent_emoji def main(): def main(): Loading @@ -278,8 +535,9 @@ def main(): if check_emoji == 'true': if check_emoji == 'true': ucd_path = sys.argv[3] ucd_path = sys.argv[3] parse_ucd(ucd_path) parse_ucd(ucd_path) check_emoji_availability() all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji() check_emoji_defaults() check_emoji_coverage(all_emoji, equivalent_emoji) check_emoji_defaults(default_emoji) if __name__ == '__main__': if __name__ == '__main__': Loading