From 8f4dba1e6c515b08bab8c78b73b4b25629cb2eab Mon Sep 17 00:00:00 2001 From: cvzi Date: Fri, 14 Jul 2023 23:45:16 +0200 Subject: [PATCH] Extract aliases from cheat-sheet and from youtuve Fix extracting from emojiterra --- ...get_codes_from_unicode_emoji_data_files.py | 162 +++++++++++++++--- utils/requirements.txt | 4 +- 2 files changed, 138 insertions(+), 28 deletions(-) diff --git a/utils/get_codes_from_unicode_emoji_data_files.py b/utils/get_codes_from_unicode_emoji_data_files.py index 6f4e2af7..efe39691 100644 --- a/utils/get_codes_from_unicode_emoji_data_files.py +++ b/utils/get_codes_from_unicode_emoji_data_files.py @@ -57,15 +57,98 @@ def get_emojiterra_from_url(url: str) -> dict: emojis = {} data = soup.find_all('li') - data = [i for i in data if 'href' not in str(i) and 'data-e' in i] + data = [i for i in data if 'href' not in i.attrs and 'data-e' in i.attrs and i['data-e'].strip()] for i in data: code = i['data-e'] emojis[code] = i['title'].strip() + assert len(data) > 100, f"emojiterra data from {url} has only {len(data)} entries" + return emojis +def get_cheat_sheet(url: str) -> dict: + """ + Returns a dict of emoji to short-names: + E.g. {'👴': ':old_man:', '👵': ':old_woman:', ... } + """ + + html = get_text_from_url(url) + + soup = bs4.BeautifulSoup(html, "html.parser") + emojis = {} + + items = soup.find(class_='ecs-list').find_all(class_='_item') + + pattern = re.compile(r'U\+([0-9A-F]+)') + + for i in items: + unicode_text = i.find(class_='unicode').text + + code_points = pattern.findall(unicode_text) + code = ''.join(chr(int(x,16)) for x in code_points) + + emojis[code] = i.find(class_='shortcode').text + + # Remove some unwanted and some weird entries from the cheat sheet + filtered = {} + for emj, short_code in emojis.items(): + + if short_code.startswith(':flag_'): + # Skip flags from cheat-sheet, because we already have very similar aliases for the flags + continue + + if '⊛' in short_code: + # Strange emoji with ⊛ in the short-code + continue + + if emj == '\U0001F93E\U0000200D\U00002640\U0000FE0F': + # The short-code for this emoji is wrong + continue + + if emj == '\U0001F468\U0000200D\U0001F468\U0000200D\U0001F467': + # The short-code for this emoji is wrong + continue + + if short_code.startswith('::'): + # Do not allow short-codes to have double :: at the start + short_code = short_code[1:] + + if short_code.endswith('::'): + # Do not allow short-codes to have double :: at the end + short_code = short_code[:-1] + + filtered[emj] = short_code + + assert len(filtered) > 100, f"emoji-cheat-sheet data from {url} has only {len(filtered)} entries" + + return filtered + +def get_emoji_from_youtube(url: str) -> dict: + """Get emoji alias from Youtube + Returns a dict of emoji to list of short-names: + E.g. {'💁': [':person_tipping_hand:', ':information_desk_person:'], '😉': [':winking_face:', ':wink:']} + """ + + data = requests.get(url).json() + + output = {} + for obj in data: + if 'shortcuts' not in obj or 'emojiId' not in obj: + continue + + shortcuts = [x for x in obj['shortcuts'] if x.startswith(':') and x.endswith(':')] + + if shortcuts: + output[obj['emojiId']] = shortcuts + + assert len(output) > 100, f"youtube data from {url} has only {len(output)} entries" + + return output + + + def extract_emojis(emojis_lines: list, sequences_lines: list) -> dict: """Extract emojis line by line to dict""" @@ -320,11 +403,11 @@ def extract_names(xml, lang, emoji_terra={}): return data -def get_emoji_from_github_api() -> dict: +def get_emoji_from_github_api(url: str) -> dict: """Get emoji alias from GitHub API """ - data = requests.get("https://api.github.com/emojis").json() + data = requests.get(url).json() pattern = re.compile(r"unicode/([0-9a-fA-F-]+)\.[a-z]+") output = {} @@ -336,13 +419,15 @@ def get_emoji_from_github_api() -> dict: else: pass # Special GitHub emoji that is not part of Unicode + assert len(output) > 100, f"data from github API has only {len(output)} entries" + return output GITHUB_REMOVED_CHARS = re.compile("\u200D|\uFE0F|\uFE0E", re.IGNORECASE) -def find_github_aliases(emj, github_dict): +def find_github_aliases(emj, github_dict, v, emj_no_variant=None): aliases = set() # Strip ZWJ \u200D, text_type \uFE0E and emoji_type \uFE0F @@ -366,12 +451,15 @@ def ascii(s): if __name__ == "__main__": + logging.info(' Downloading...\n') + # Find the latest version at https://www.unicode.org/reports/tr51/#emoji_data emoji_source = get_emoji_from_url(15.0) emoji_sequences_source = get_emoji_variation_sequence_from_url('15.0.0') emojis = extract_emojis(emoji_source, emoji_sequences_source) # Find latest release tag at https://cldr.unicode.org/index/downloads - github_tag = 'release-42' + github_tag = 'release-43' + languages = { # Update names in other languages: 'de': extract_names(get_language_data_from_url(github_tag, 'de'), 'de', get_emojiterra_from_url('https://emojiterra.com/de/kopieren/')), @@ -389,6 +477,8 @@ def ascii(s): # 'de': get_UNICODE_EMOJI('de'), # 'es': get_UNICODE_EMOJI('es'), # 'fr': get_UNICODE_EMOJI('fr'), + # 'ja': get_UNICODE_EMOJI('ja'), + # 'ko': get_UNICODE_EMOJI('ko'), # 'pt': get_UNICODE_EMOJI('pt'), # 'it': get_UNICODE_EMOJI('it'), # 'fa': get_UNICODE_EMOJI('fa'), @@ -396,18 +486,30 @@ def ascii(s): # 'zh': get_UNICODE_EMOJI('zh'), } - github_alias_dict = get_emoji_from_github_api() + github_alias_dict = get_emoji_from_github_api('https://api.github.com/emojis') + cheat_sheet_dict = get_cheat_sheet('https://www.webfx.com/tools/emoji-cheat-sheet/') + youtube_dict = get_emoji_from_youtube('https://www.gstatic.com/youtube/img/emojis/emojis-png-7.json') + + logging.info(' Combining...\n') + used_github_aliases = set() escapedToUnicodeMap = {escaped: escaped.encode().decode('unicode-escape') for escaped in emojis} # maps: "\\U0001F4A4" to "\U0001F4A4" + all_existing_aliases_and_en = set(item for emj_data in emoji_pkg.EMOJI_DATA.values() for item in emj_data.get('alias', [])) + all_existing_aliases_and_en.update(emj_data['en'] for emj_data in emoji_pkg.EMOJI_DATA.values()) + f = 0 c = 0 new_aliases = [] - # Print the dict of dicts + logging.info(' Print EMOJI_DATA...\n') for code, v in sorted(emojis.items(), key=lambda item: item[1]["en"]): language_str = '' emj = escapedToUnicodeMap[code] + + alternative = re.sub(r"\\U0000FE0[EF]$", "", code) + emj_no_variant = escapedToUnicodeMap[alternative] + # add names in other languages for lang in languages: if emj in languages[lang]: @@ -415,8 +517,6 @@ def ascii(s): lang, languages[lang][emj]) elif 'variant' in v: # the language annotation uses the normal emoji (no variant), while the emoji-test.txt uses the emoji or text variant - alternative = re.sub(r"\\U0000FE0[EF]$", "", code) # Strip the variant - emj_no_variant = escapedToUnicodeMap[alternative] if emj_no_variant in languages[lang]: language_str += ",\n '%s': '%s'" % ( lang, languages[lang][emj_no_variant]) @@ -427,17 +527,26 @@ def ascii(s): aliases.update(a[1:-1] for a in emoji_pkg.EMOJI_DATA[emj]['alias']) old_aliases = set(aliases) - if 'variant' in v: - alternative = re.sub(r"\\U0000FE0[EF]$", "", code) - emj_no_variant = escapedToUnicodeMap[alternative] - if emj_no_variant in emoji_pkg.EMOJI_DATA and 'alias' in emoji_pkg.EMOJI_DATA[emj_no_variant]: - aliases.update(a[1:-1] for a in emoji_pkg.EMOJI_DATA[emj_no_variant]['alias']) + if emj_no_variant in emoji_pkg.EMOJI_DATA and 'alias' in emoji_pkg.EMOJI_DATA[emj_no_variant]: + aliases.update(a[1:-1] for a in emoji_pkg.EMOJI_DATA[emj_no_variant]['alias']) - # Add alias from GitHub API - github_aliases = find_github_aliases(emj, github_alias_dict) - aliases.update(github_aliases) + # Add alias from GitHub API + github_aliases = find_github_aliases(emj, github_alias_dict, v, emj_no_variant) + aliases.update(shortcut for shortcut in github_aliases if shortcut not in all_existing_aliases_and_en) used_github_aliases.update(github_aliases) + # Add alias from cheat sheet + if emj in cheat_sheet_dict and cheat_sheet_dict[emj] not in all_existing_aliases_and_en: + aliases.add(cheat_sheet_dict[emj][1:-1]) + if emj_no_variant in cheat_sheet_dict and cheat_sheet_dict[emj_no_variant] not in all_existing_aliases_and_en: + aliases.add(cheat_sheet_dict[emj_no_variant][1:-1]) + + # Add alias from youtube + if emj in youtube_dict: + aliases.update(shortcut[1:-1] for shortcut in youtube_dict[emj] if shortcut not in all_existing_aliases_and_en) + if emj_no_variant in youtube_dict: + aliases.update(shortcut[1:-1] for shortcut in youtube_dict[emj_no_variant] if shortcut not in all_existing_aliases_and_en) + # Remove if alias is same as 'en'-name if v["en"] in aliases: aliases.remove(v["en"]) @@ -451,10 +560,9 @@ def ascii(s): for a in diff: new_aliases.append(f"# alias NEW {a} FOR {emj} CODE {code}") - # Try to keep order of aliases intact - if aliases == old_aliases and emj in emoji_pkg.EMOJI_DATA and 'alias' in emoji_pkg.EMOJI_DATA[emj]: - # Use list instead of set, if there are no new aliases to keep the order intact - aliases = [a[1:-1] for a in emoji_pkg.EMOJI_DATA[emj]['alias']] + # Keep the order of existing aliases intact + if emj in emoji_pkg.EMOJI_DATA and 'alias' in emoji_pkg.EMOJI_DATA[emj]: + aliases = [a[1:-1] for a in emoji_pkg.EMOJI_DATA[emj]['alias']] + [a for a in aliases if f":{a}:" not in emoji_pkg.EMOJI_DATA[emj]['alias']] if any("flag_for_" in a for a in aliases): # Put the :flag_for_COUNTRY: alias as the first entry so that it gets picked by demojize() @@ -477,12 +585,14 @@ def ascii(s): elif v["status"] == "component": c += 1 - print("\n# Total count of emojis: ", len(emojis)) - print("# fully_qualified: ", f) - print("# component: ", c) - print("\n".join(new_aliases)) + logging.debug(f" # Total count of emojis: {len(emojis)}") + logging.debug(f" # fully_qualified: {f}") + logging.debug(f" # component: {c}\n") + logging.debug("\n".join(new_aliases)) # Check if all aliases from GitHub API were used for github_alias in github_alias_dict: if github_alias not in used_github_aliases: - print("# Unused Github alias:", github_alias, github_alias_dict[github_alias], ascii(github_alias_dict[github_alias])) + logging.debug(f"# Unused Github alias: {github_alias} {github_alias_dict[github_alias]} {ascii(github_alias_dict[github_alias])}") + + logging.info('\n\n Done.') \ No newline at end of file diff --git a/utils/requirements.txt b/utils/requirements.txt index 4f441389..4355c92b 100644 --- a/utils/requirements.txt +++ b/utils/requirements.txt @@ -1,2 +1,2 @@ -requests>=2.28.1 -beautifulsoup4>=4.11.1 \ No newline at end of file +requests>=2.31.0 +beautifulsoup4>=4.12.2 \ No newline at end of file