From 89feb6d5fd38aa9b493d6fc3ca5b546c373aac31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Thu, 18 Apr 2019 15:30:50 +0200 Subject: [PATCH 01/15] Clean up unicode.py script --- .gitignore | 1 + src/libcore/unicode/unicode.py | 372 ++++++++++++++++++++++++--------- 2 files changed, 270 insertions(+), 103 deletions(-) diff --git a/.gitignore b/.gitignore index 67e0dd8e795bb..51f3e722ca7d8 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ __pycache__/ /src/libcore/unicode/Scripts.txt /src/libcore/unicode/SpecialCasing.txt /src/libcore/unicode/UnicodeData.txt +/src/libcore/unicode/downloaded /stage[0-9]+/ /target target/ diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py index ae356c3ff4459..97c11fb795ea8 100755 --- a/src/libcore/unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -1,35 +1,71 @@ #!/usr/bin/env python -# This script uses the following Unicode tables: -# - DerivedCoreProperties.txt -# - DerivedNormalizationProps.txt -# - EastAsianWidth.txt -# - auxiliary/GraphemeBreakProperty.txt -# - PropList.txt -# - ReadMe.txt -# - Scripts.txt -# - UnicodeData.txt -# +""" +Regenerate Unicode tables (tables.rs). +""" + +# This script uses the Unicode tables as defined +# in the UnicodeFiles class. + # Since this should not require frequent updates, we just store this # out-of-line and check the tables.rs file into git. -import fileinput, re, os, sys, operator, math, datetime +# Note that the "curl" program is required for operation. +# This script is compatible with Python 2.7 and 3.x. + +import argparse +import datetime +import fileinput +import operator +import os +import re +import textwrap +import subprocess + +from collections import namedtuple + + +# we don't use enum.Enum because of Python 2.7 compatibility +class UnicodeFiles(object): + # ReadMe does not contain any unicode data, we + # use it to extract versions. + README = "ReadMe.txt" + + DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt" + DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt" + SPECIAL_CASING = "SpecialCasing.txt" + SCRIPTS = "Scripts.txt" + PROPS = "PropList.txt" + UNICODE_DATA = "UnicodeData.txt" + + +UnicodeFiles.ALL_FILES = tuple( + getattr(UnicodeFiles, name) for name in dir(UnicodeFiles) + if not name.startswith("_") +) -# The directory in which this file resides. -fdir = os.path.dirname(os.path.realpath(__file__)) + "/" +# The directory this file is located in. +THIS_DIR = os.path.dirname(os.path.realpath(__file__)) -preamble = ''' +# Where to download the Unicode data. The downloaded files +# will be placed in sub-directories named after Unicode version. 
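+# (For example: downloaded/<version>/UnicodeData.txt.)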
+FETCH_DIR = os.path.join(THIS_DIR, "downloaded") + +FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}" +FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}" + +PREAMBLE = """\ // NOTE: The following code was generated by "./unicode.py", do not edit directly #![allow(missing_docs, non_upper_case_globals, non_snake_case)] use unicode::version::UnicodeVersion; use unicode::bool_trie::{{BoolTrie, SmallBoolTrie}}; -'''.format(year = datetime.datetime.now().year) +""".format(year=datetime.datetime.now().year) # Mapping taken from Table 12 from: # http://www.unicode.org/reports/tr44/#General_Category_Values -expanded_categories = { +EXPANDED_CATEGORIES = { 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], 'Lm': ['L'], 'Lo': ['L'], 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], @@ -42,22 +78,101 @@ } # these are the surrogate codepoints, which are not valid rust characters -surrogate_codepoints = (0xd800, 0xdfff) +SURROGATE_CODEPOINTS = (0xd800, 0xdfff) + +UnicodeData = namedtuple( + "UnicodeData", ("canon_decomp", "compat_decomp", "gencats", "combines", + "to_upper", "to_lower", "to_title", ) +) + +UnicodeVersion = namedtuple( + "UnicodeVersion", ("major", "minor", "micro", "as_str") +) + + +def fetch_files(version=None): + """ + Fetch all the Unicode files from unicode.org + + :param version: The desired Unicode version, as string. + (If None, defaults to latest final release available). + :return: The version downloaded (UnicodeVersion object). + """ + have_version = should_skip_fetch(version) + if have_version: + return have_version + + if version: + # check if the desired version exists on the server + get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name) + else: + # extract the latest version + get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name) + + readme_url = get_fetch_url(UnicodeFiles.README) + + print("Fetching: {}".format(readme_url)) + readme_content = subprocess.check_output(("curl", readme_url)) + + unicode_version = parse_unicode_version( + str(readme_content, "utf8") + ) + + download_dir = os.path.join(FETCH_DIR, unicode_version.as_str) + if not os.path.exists(download_dir): + # for 2.7 compat, we don't use exist_ok=True + os.makedirs(download_dir) + + for filename in UnicodeFiles.ALL_FILES: + file_path = os.path.join(download_dir, filename) + + if filename == UnicodeFiles.README: + with open(file_path, "wb") as fd: + fd.write(readme_content) + elif not os.path.exists(file_path): + url = get_fetch_url(filename) + print("Fetching: {}".format(url)) + subprocess.check_call(("curl", "-o", file_path, url)) + + return unicode_version + + +def should_skip_fetch(version): + if not version: + # should always check latest version + return False + + fetch_dir = os.path.join(FETCH_DIR, version) + + for filename in UnicodeFiles.ALL_FILES: + file_path = os.path.join(fetch_dir, filename) + + if not os.path.exists(file_path): + return False + + with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd: + return parse_unicode_version(fd.read()) + + +def parse_unicode_version(readme_content): + # "raw string" is necessary for \d not being treated as escape char + # (for the sake of compat with future Python versions) + # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior + pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" + groups = re.search(pattern, readme_content).groups() + + return UnicodeVersion(*map(int, groups), as_str=".".join(groups)) + -def fetch(f): - path = 
fdir + os.path.basename(f) - if not os.path.exists(path): - os.system("curl -o {0}{1} ftp://ftp.unicode.org/Public/UNIDATA/{1}".format(fdir, f)) +def get_unicode_file_path(unicode_version, filename): + return os.path.join(FETCH_DIR, unicode_version.as_str, filename) - if not os.path.exists(path): - sys.stderr.write("cannot load %s" % f) - exit(1) def is_surrogate(n): - return surrogate_codepoints[0] <= n <= surrogate_codepoints[1] + return SURROGATE_CODEPOINTS[0] <= n <= SURROGATE_CODEPOINTS[1] -def load_unicode_data(f): - fetch(f) + +def load_unicode_data(file_path): gencats = {} to_lower = {} to_upper = {} @@ -68,8 +183,8 @@ def load_unicode_data(f): udict = {} range_start = -1 - for line in fileinput.input(fdir + f): - data = line.split(';') + for line in fileinput.input(file_path): + data = line.split(";") if len(data) != 15: continue cp = int(data[0], 16) @@ -104,7 +219,7 @@ def load_unicode_data(f): # store decomposition, if given if decomp != "": - if decomp.startswith('<'): + if decomp.startswith("<"): seq = [] for i in decomp.split()[1:]: seq.append(int(i, 16)) @@ -116,7 +231,7 @@ def load_unicode_data(f): canon_decomp[code] = seq # place letter in categories as appropriate - for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []): + for cat in [gencat, "Assigned"] + EXPANDED_CATEGORIES.get(gencat, []): if cat not in gencats: gencats[cat] = [] gencats[cat].append(code) @@ -136,12 +251,15 @@ def load_unicode_data(f): gencats = group_cats(gencats) combines = to_combines(group_cats(combines)) - return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title) + return UnicodeData( + canon_decomp, compat_decomp, gencats, combines, to_upper, + to_lower, to_title, + ) + -def load_special_casing(f, to_upper, to_lower, to_title): - fetch(f) - for line in fileinput.input(fdir + f): - data = line.split('#')[0].split(';') +def load_special_casing(file_path, unicode_data): + for line in fileinput.input(file_path): + data = line.split("#")[0].split(";") if len(data) == 5: code, lower, title, upper, _comment = data elif len(data) == 6: @@ -155,7 +273,9 @@ def load_special_casing(f, to_upper, to_lower, to_title): title = title.strip() upper = upper.strip() key = int(code, 16) - for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]: + for (map_, values) in ((unicode_data.to_lower, lower), + (unicode_data.to_upper, upper), + (unicode_data.to_title, title)): if values != code: values = [int(i, 16) for i in values.split()] for _ in range(len(values), 3): @@ -163,12 +283,14 @@ def load_special_casing(f, to_upper, to_lower, to_title): assert len(values) == 3 map_[key] = values + def group_cats(cats): cats_out = {} for cat in cats: cats_out[cat] = group_cat(cats[cat]) return cats_out + def group_cat(cat): cat_out = [] letters = sorted(set(cat)) @@ -185,6 +307,7 @@ def group_cat(cat): cat_out.append((cur_start, cur_end)) return cat_out + def ungroup_cat(cat): cat_out = [] for (lo, hi) in cat: @@ -193,21 +316,24 @@ def ungroup_cat(cat): lo += 1 return cat_out + def gen_unassigned(assigned): assigned = set(assigned) return ([i for i in range(0, 0xd800) if i not in assigned] + [i for i in range(0xe000, 0x110000) if i not in assigned]) + def to_combines(combs): combs_out = [] for comb in combs: for (lo, hi) in combs[comb]: combs_out.append((lo, hi, comb)) - combs_out.sort(key=lambda comb: comb[0]) + combs_out.sort(key=lambda c: c[0]) return combs_out + def format_table_content(f, content, indent): - line = " "*indent + line = " " * indent first = 
True for chunk in content.split(","): if len(line) + len(chunk) < 98: @@ -218,16 +344,19 @@ def format_table_content(f, content, indent): first = False else: f.write(line + ",\n") - line = " "*indent + chunk + line = " " * indent + chunk f.write(line) -def load_properties(f, interestingprops): - fetch(f) + +def load_properties(file_path, interestingprops): props = {} - re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)") - re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") + # "raw string" is necessary for \w not to be treated as escape char + # (for the sake of compat with future Python versions) + # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior + re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)") + re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") - for line in fileinput.input(fdir + os.path.basename(f)): + for line in fileinput.input(file_path): prop = None d_lo = 0 d_hi = 0 @@ -258,10 +387,12 @@ def load_properties(f, interestingprops): return props + def escape_char(c): return "'\\u{%x}'" % c if c != 0 else "'\\0'" -def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True, + +def emit_table(f, name, t_data, t_type="&[(char, char)]", is_pub=True, pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))): pub_string = "" if is_pub: @@ -277,6 +408,7 @@ def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True, format_table_content(f, data, 8) f.write("\n ];\n\n") + def compute_trie(rawdata, chunksize): root = [] childmap = {} @@ -288,10 +420,11 @@ def compute_trie(rawdata, chunksize): childmap[child] = len(childmap) child_data.extend(data) root.append(childmap[child]) - return (root, child_data) + return root, child_data + def emit_bool_trie(f, name, t_data, is_pub=True): - CHUNK = 64 + chunk_size = 64 rawdata = [False] * 0x110000 for (lo, hi) in t_data: for cp in range(lo, hi + 1): @@ -299,7 +432,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True): # convert to bitmap chunks of 64 bits each chunks = [] - for i in range(0x110000 // CHUNK): + for i in range(0x110000 // chunk_size): chunk = 0 for j in range(64): if rawdata[i * 64 + j]: @@ -311,12 +444,12 @@ def emit_bool_trie(f, name, t_data, is_pub=True): pub_string = "pub " f.write(" %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)) f.write(" r1: [\n") - data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK]) + data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // chunk_size]) format_table_content(f, data, 12) f.write("\n ],\n") # 0x800..0x10000 trie - (r2, r3) = compute_trie(chunks[0x800 // CHUNK : 0x10000 // CHUNK], 64 // CHUNK) + (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size) f.write(" r2: [\n") data = ','.join(str(node) for node in r2) format_table_content(f, data, 12) @@ -327,7 +460,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True): f.write("\n ],\n") # 0x10000..0x110000 trie - (mid, r6) = compute_trie(chunks[0x10000 // CHUNK : 0x110000 // CHUNK], 64 // CHUNK) + (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size) (r4, r5) = compute_trie(mid, 64) f.write(" r4: [\n") data = ','.join(str(node) for node in r4) @@ -344,6 +477,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True): f.write(" };\n\n") + def emit_small_bool_trie(f, name, t_data, is_pub=True): last_chunk = max(hi // 64 for (lo, hi) in t_data) n_chunks = last_chunk + 1 @@ -374,6 +508,7 @@ def emit_small_bool_trie(f, name, t_data, is_pub=True): 
f.write(" };\n\n") + def emit_property_module(f, mod, tbl, emit): f.write("pub mod %s {\n" % mod) for cat in sorted(emit): @@ -389,7 +524,8 @@ def emit_property_module(f, mod, tbl, emit): f.write(" }\n\n") f.write("}\n\n") -def emit_conversions_module(f, to_upper, to_lower, to_title): + +def emit_conversions_module(f, unicode_data): f.write("pub mod conversions {") f.write(""" pub fn to_lower(c: char) -> [char; 3] { @@ -414,74 +550,104 @@ def emit_conversions_module(f, to_upper, to_lower, to_title): t_type = "&[(char, [char; 3])]" pfun = lambda x: "(%s,[%s,%s,%s])" % ( escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])) - emit_table(f, "to_lowercase_table", - sorted(to_lower.items(), key=operator.itemgetter(0)), - is_pub=False, t_type = t_type, pfun=pfun) - emit_table(f, "to_uppercase_table", - sorted(to_upper.items(), key=operator.itemgetter(0)), - is_pub=False, t_type = t_type, pfun=pfun) - f.write("}\n\n") -def emit_norm_module(f, canon, compat, combine, norm_props): - canon_keys = sorted(canon.keys()) + emit_table(f, + name="to_lowercase_table", + t_data=sorted(unicode_data.to_lower.items(), key=operator.itemgetter(0)), + t_type=t_type, + is_pub=False, + pfun=pfun) - compat_keys = sorted(compat.keys()) + emit_table(f, + name="to_uppercase_table", + t_data=sorted(unicode_data.to_upper.items(), key=operator.itemgetter(0)), + t_type=t_type, + is_pub=False, + pfun=pfun) + + f.write("}\n") + + +def emit_norm_module(f, unicode_data, norm_props): + canon_keys = sorted(unicode_data.canon_decomp.keys()) canon_comp = {} comp_exclusions = norm_props["Full_Composition_Exclusion"] for char in canon_keys: if any(lo <= char <= hi for lo, hi in comp_exclusions): continue - decomp = canon[char] + decomp = unicode_data.canon_decomp[char] if len(decomp) == 2: if decomp[0] not in canon_comp: canon_comp[decomp[0]] = [] - canon_comp[decomp[0]].append( (decomp[1], char) ) - canon_comp_keys = sorted(canon_comp.keys()) + canon_comp[decomp[0]].append((decomp[1], char)) -if __name__ == "__main__": - r = fdir + "tables.rs" - if os.path.exists(r): - os.remove(r) - with open(r, "w") as rf: + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-v", "--version", default=None, type=str, + help="Unicode version to use (if not specified," + " defaults to latest available final release).") + + return parser.parse_args() + + +def main(): + args = parse_args() + + unicode_version = fetch_files(args.version) + print("Using Unicode version: {}".format(unicode_version.as_str)) + + tables_rs_path = os.path.join(THIS_DIR, "tables.rs") + if os.path.exists(tables_rs_path): + os.remove(tables_rs_path) + + with open(tables_rs_path, "w") as rf: # write the file's preamble - rf.write(preamble) - - # download and parse all the data - fetch("ReadMe.txt") - with open(fdir + "ReadMe.txt") as readme: - pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode" - unicode_version = re.search(pattern, readme.read()).groups() - rf.write(""" -/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of -/// `char` and `str` methods are based on. 
-#[unstable(feature = "unicode_version", issue = "49726")] -pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion { - major: %s, - minor: %s, - micro: %s, - _priv: (), -}; -""" % unicode_version) - (canon_decomp, compat_decomp, gencats, combines, - to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt") - load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title) + rf.write(PREAMBLE) + + unicode_version_notice = textwrap.dedent(""" + /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of + /// `char` and `str` methods are based on. + #[unstable(feature = "unicode_version", issue = "49726")] + pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{ + major: {version.major}, + minor: {version.minor}, + micro: {version.micro}, + _priv: (), + }}; + """).format(version=unicode_version) + rf.write(unicode_version_notice) + + get_path = lambda f: get_unicode_file_path(unicode_version, f) + + unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA)) + load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data) + want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase", "Cased", "Case_Ignorable", "Grapheme_Extend"] - derived = load_properties("DerivedCoreProperties.txt", want_derived) - scripts = load_properties("Scripts.txt", []) - props = load_properties("PropList.txt", - ["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"]) - norm_props = load_properties("DerivedNormalizationProps.txt", - ["Full_Composition_Exclusion"]) + derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived) + + # TODO scripts not used? + scripts = load_properties(get_path(UnicodeFiles.SCRIPTS), []) + props = load_properties(get_path(UnicodeFiles.PROPS), + ["White_Space", "Join_Control", "Noncharacter_Code_Point", + "Pattern_White_Space"]) + norm_props = load_properties(get_path(UnicodeFiles.DERIVED_NORMALIZATION_PROPS), + ["Full_Composition_Exclusion"]) # category tables - for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \ - ("derived_property", derived, want_derived), \ - ("property", props, ["White_Space", "Pattern_White_Space"]): + for (name, cat, pfuns) in (("general_category", unicode_data.gencats, ["N", "Cc"]), + ("derived_property", derived, want_derived), + ("property", props, ["White_Space", "Pattern_White_Space"])): emit_property_module(rf, name, cat, pfuns) # normalizations and conversions module - emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props) - emit_conversions_module(rf, to_upper, to_lower, to_title) + emit_norm_module(rf, unicode_data, norm_props) + emit_conversions_module(rf, unicode_data) + print("Regenerated tables.rs.") + + +if __name__ == "__main__": + main() From a580421afbd6ee93aaab0ad01dee3df8343a88dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Thu, 18 Apr 2019 16:16:34 +0200 Subject: [PATCH 02/15] More cleanups for unicode.py --- src/libcore/unicode/unicode.py | 48 ++++++++++++++++------------------ 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py index 97c11fb795ea8..447f4274c18da 100755 --- a/src/libcore/unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -28,14 +28,14 @@ # we don't use enum.Enum because of Python 2.7 compatibility class UnicodeFiles(object): # ReadMe does not contain any unicode data, we - # use it to extract versions. + # only use it to extract versions. 
README = "ReadMe.txt" DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt" DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt" - SPECIAL_CASING = "SpecialCasing.txt" - SCRIPTS = "Scripts.txt" PROPS = "PropList.txt" + SCRIPTS = "Scripts.txt" + SPECIAL_CASING = "SpecialCasing.txt" UNICODE_DATA = "UnicodeData.txt" @@ -66,15 +66,15 @@ class UnicodeFiles(object): # Mapping taken from Table 12 from: # http://www.unicode.org/reports/tr44/#General_Category_Values EXPANDED_CATEGORIES = { - 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], - 'Lm': ['L'], 'Lo': ['L'], - 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], - 'Nd': ['N'], 'Nl': ['N'], 'No': ['N'], - 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], - 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], - 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], - 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], - 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], + "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"], + "Lm": ["L"], "Lo": ["L"], + "Mn": ["M"], "Mc": ["M"], "Me": ["M"], + "Nd": ["N"], "Nl": ["N"], "No": ["N"], + "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"], + "Pi": ["P"], "Pf": ["P"], "Po": ["P"], + "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"], + "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"], + "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"], } # these are the surrogate codepoints, which are not valid rust characters @@ -115,7 +115,7 @@ def fetch_files(version=None): readme_content = subprocess.check_output(("curl", readme_url)) unicode_version = parse_unicode_version( - str(readme_content, "utf8") + readme_content.decode("utf8") ) download_dir = os.path.join(FETCH_DIR, unicode_version.as_str) @@ -415,7 +415,7 @@ def compute_trie(rawdata, chunksize): child_data = [] for i in range(len(rawdata) // chunksize): data = rawdata[i * chunksize: (i + 1) * chunksize] - child = '|'.join(map(str, data)) + child = "|".join(map(str, data)) if child not in childmap: childmap[child] = len(childmap) child_data.extend(data) @@ -444,18 +444,18 @@ def emit_bool_trie(f, name, t_data, is_pub=True): pub_string = "pub " f.write(" %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)) f.write(" r1: [\n") - data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // chunk_size]) + data = ",".join("0x%016x" % chunk for chunk in chunks[0:0x800 // chunk_size]) format_table_content(f, data, 12) f.write("\n ],\n") # 0x800..0x10000 trie (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size) f.write(" r2: [\n") - data = ','.join(str(node) for node in r2) + data = ",".join(str(node) for node in r2) format_table_content(f, data, 12) f.write("\n ],\n") f.write(" r3: &[\n") - data = ','.join('0x%016x' % chunk for chunk in r3) + data = ",".join("0x%016x" % chunk for chunk in r3) format_table_content(f, data, 12) f.write("\n ],\n") @@ -463,15 +463,15 @@ def emit_bool_trie(f, name, t_data, is_pub=True): (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size) (r4, r5) = compute_trie(mid, 64) f.write(" r4: [\n") - data = ','.join(str(node) for node in r4) + data = ",".join(str(node) for node in r4) format_table_content(f, data, 12) f.write("\n ],\n") f.write(" r5: &[\n") - data = ','.join(str(node) for node in r5) + data = ",".join(str(node) for node in r5) format_table_content(f, data, 12) f.write("\n ],\n") f.write(" r6: &[\n") - data = ','.join('0x%016x' % chunk for chunk in r6) + data = ",".join("0x%016x" % chunk for chunk in r6) 
format_table_content(f, data, 12) f.write("\n ],\n") @@ -497,12 +497,12 @@ def emit_small_bool_trie(f, name, t_data, is_pub=True): (r1, r2) = compute_trie(chunks, 1) f.write(" r1: &[\n") - data = ','.join(str(node) for node in r1) + data = ",".join(str(node) for node in r1) format_table_content(f, data, 12) f.write("\n ],\n") f.write(" r2: &[\n") - data = ','.join('0x%016x' % node for node in r2) + data = ",".join("0x%016x" % node for node in r2) format_table_content(f, data, 12) f.write("\n ],\n") @@ -599,11 +599,9 @@ def main(): print("Using Unicode version: {}".format(unicode_version.as_str)) tables_rs_path = os.path.join(THIS_DIR, "tables.rs") - if os.path.exists(tables_rs_path): - os.remove(tables_rs_path) + # will overwrite the file if it exists with open(tables_rs_path, "w") as rf: - # write the file's preamble rf.write(PREAMBLE) unicode_version_notice = textwrap.dedent(""" From edbc27da2dc5a75f8e0ac70e7a5e07aa6f6f0a16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Thu, 18 Apr 2019 17:14:31 +0200 Subject: [PATCH 03/15] Fix tidy errors --- src/libcore/unicode/unicode.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py index 447f4274c18da..e645c3f33c84c 100755 --- a/src/libcore/unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -460,7 +460,8 @@ def emit_bool_trie(f, name, t_data, is_pub=True): f.write("\n ],\n") # 0x10000..0x110000 trie - (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size) + (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], + 64 // chunk_size) (r4, r5) = compute_trie(mid, 64) f.write(" r4: [\n") data = ",".join(str(node) for node in r4) @@ -626,7 +627,7 @@ def main(): "Cased", "Case_Ignorable", "Grapheme_Extend"] derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived) - # TODO scripts not used? + # FIXME scripts not used? scripts = load_properties(get_path(UnicodeFiles.SCRIPTS), []) props = load_properties(get_path(UnicodeFiles.PROPS), ["White_Space", "Join_Control", "Noncharacter_Code_Point", From 2c9c978e1d4a3541d8df593346c7520c8ef4d69e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Fri, 19 Apr 2019 11:42:08 +0200 Subject: [PATCH 04/15] Refactor and document unicode.py script --- src/libcore/unicode/unicode.py | 820 +++++++++++++++++++++------------ 1 file changed, 518 insertions(+), 302 deletions(-) diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py index e645c3f33c84c..f66e82299100d 100755 --- a/src/libcore/unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -16,13 +16,31 @@ import argparse import datetime import fileinput -import operator +import itertools import os import re import textwrap import subprocess -from collections import namedtuple +from collections import defaultdict, namedtuple + +try: + # Python 3 + from itertools import zip_longest + from io import StringIO +except ImportError: + # Python 2 compatibility + zip_longest = itertools.izip_longest + from StringIO import StringIO + +try: + # completely optional type hinting + # (Python 2 compatible using comments, + # see: https://mypy.readthedocs.io/en/latest/python2.html) + # This is very helpful in typing-aware IDE like PyCharm. 
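+    # If the typing module is unavailable (e.g. Python 2 without the
+    # backport installed), the ImportError below is deliberately
+    # swallowed and the script runs without type hints.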
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple +except ImportError: + pass # we don't use enum.Enum because of Python 2.7 compatibility @@ -77,12 +95,21 @@ class UnicodeFiles(object): "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"], } -# these are the surrogate codepoints, which are not valid rust characters -SURROGATE_CODEPOINTS = (0xd800, 0xdfff) +# this is the surrogate codepoints range (both ends inclusive) +# - they are not valid Rust characters +SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff) UnicodeData = namedtuple( - "UnicodeData", ("canon_decomp", "compat_decomp", "gencats", "combines", - "to_upper", "to_lower", "to_title", ) + "UnicodeData", ( + # conversions: + "to_upper", "to_lower", "to_title", + + # decompositions: canonical decompositions, compatibility decomp + "canon_decomp", "compat_decomp", + + # grouped: general categories and combining characters + "general_categories", "combines", + ) ) UnicodeVersion = namedtuple( @@ -91,14 +118,19 @@ class UnicodeFiles(object): def fetch_files(version=None): + # type: (str) -> UnicodeVersion """ - Fetch all the Unicode files from unicode.org + Fetch all the Unicode files from unicode.org. + + This will use cached files (stored in FETCH_DIR) if they exist, + creating them if they don't. In any case, the Unicode version + is always returned. :param version: The desired Unicode version, as string. - (If None, defaults to latest final release available). - :return: The version downloaded (UnicodeVersion object). + (If None, defaults to latest final release available, + querying the unicode.org service). """ - have_version = should_skip_fetch(version) + have_version = check_stored_version(version) if have_version: return have_version @@ -114,22 +146,26 @@ def fetch_files(version=None): print("Fetching: {}".format(readme_url)) readme_content = subprocess.check_output(("curl", readme_url)) - unicode_version = parse_unicode_version( + unicode_version = parse_readme_unicode_version( readme_content.decode("utf8") ) - download_dir = os.path.join(FETCH_DIR, unicode_version.as_str) + download_dir = get_unicode_dir(unicode_version) if not os.path.exists(download_dir): # for 2.7 compat, we don't use exist_ok=True os.makedirs(download_dir) for filename in UnicodeFiles.ALL_FILES: - file_path = os.path.join(download_dir, filename) + file_path = get_unicode_file_path(unicode_version, filename) + + if os.path.exists(file_path): + # assume file on the server didn't change if it's been saved before + continue if filename == UnicodeFiles.README: with open(file_path, "wb") as fd: fd.write(readme_content) - elif not os.path.exists(file_path): + else: url = get_fetch_url(filename) print("Fetching: {}".format(url)) subprocess.check_call(("curl", "-o", file_path, url)) @@ -137,10 +173,15 @@ def fetch_files(version=None): return unicode_version -def should_skip_fetch(version): +def check_stored_version(version): + # type: (Optional[str]) -> Optional[UnicodeVersion] + """ + Given desired Unicode version, return the version + if stored files are all present, and None otherwise. 
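+
+    (A version is only returned when every required data file is
+    already present in the local fetch directory.)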
+ """ if not version: # should always check latest version - return False + return None fetch_dir = os.path.join(FETCH_DIR, version) @@ -148,13 +189,17 @@ def should_skip_fetch(version): file_path = os.path.join(fetch_dir, filename) if not os.path.exists(file_path): - return False + return None with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd: - return parse_unicode_version(fd.read()) + return parse_readme_unicode_version(fd.read()) -def parse_unicode_version(readme_content): +def parse_readme_unicode_version(readme_content): + # type: (str) -> UnicodeVersion + """ + Parse the Unicode version contained in their ReadMe.txt file. + """ # "raw string" is necessary for \d not being treated as escape char # (for the sake of compat with future Python versions) # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior @@ -164,45 +209,78 @@ def parse_unicode_version(readme_content): return UnicodeVersion(*map(int, groups), as_str=".".join(groups)) +def get_unicode_dir(unicode_version): + # type: (UnicodeVersion) -> str + """ + Indicate where the unicode data files should be stored. + + This returns a full, absolute path. + """ + return os.path.join(FETCH_DIR, unicode_version.as_str) + + def get_unicode_file_path(unicode_version, filename): - return os.path.join(FETCH_DIR, unicode_version.as_str, filename) + # type: (UnicodeVersion, str) -> str + """ + Indicate where the unicode data file should be stored. + """ + return os.path.join(get_unicode_dir(unicode_version), filename) def is_surrogate(n): - return SURROGATE_CODEPOINTS[0] <= n <= SURROGATE_CODEPOINTS[1] + # type: (int) -> bool + """ + Tell if given codepoint is a surrogate (not a valid Rust character). + """ + return SURROGATE_CODEPOINTS_RANGE[0] <= n <= SURROGATE_CODEPOINTS_RANGE[1] def load_unicode_data(file_path): - gencats = {} - to_lower = {} - to_upper = {} - to_title = {} - combines = {} - canon_decomp = {} - compat_decomp = {} - - udict = {} + # type: (str) -> UnicodeData + """ + Load main unicode data. 
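+
+    The file is expected to be in UnicodeData.txt format: one
+    semicolon-separated record of 15 fields per line (lines with a
+    different field count are skipped).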
+ """ + # conversions + to_lower = {} # type: Dict[int, Tuple[int, int, int]] + to_upper = {} # type: Dict[int, Tuple[int, int, int]] + to_title = {} # type: Dict[int, Tuple[int, int, int]] + + # decompositions + compat_decomp = {} # type: Dict[int, List[int]] + canon_decomp = {} # type: Dict[int, List[int]] + + # combining characters + # FIXME: combines are not used + combines = defaultdict(set) # type: Dict[str, Set[int]] + + # categories + general_categories = defaultdict(set) # type: Dict[str, Set[int]] + category_assigned_codepoints = set() # type: Set[int] + + all_codepoints = {} + range_start = -1 + for line in fileinput.input(file_path): data = line.split(";") if len(data) != 15: continue - cp = int(data[0], 16) - if is_surrogate(cp): + codepoint = int(data[0], 16) + if is_surrogate(codepoint): continue if range_start >= 0: - for i in range(range_start, cp): - udict[i] = data + for i in range(range_start, codepoint): + all_codepoints[i] = data range_start = -1 if data[1].endswith(", First>"): - range_start = cp + range_start = codepoint continue - udict[cp] = data + all_codepoints[codepoint] = data - for code in udict: + for code, data in all_codepoints.items(): (code_org, name, gencat, combine, bidi, decomp, deci, digit, num, mirror, - old, iso, upcase, lowcase, titlecase) = udict[code] + old, iso, upcase, lowcase, titlecase) = data # generate char to char direct common and simple conversions # uppercase to lowercase @@ -218,46 +296,47 @@ def load_unicode_data(file_path): to_title[code] = (int(titlecase, 16), 0, 0) # store decomposition, if given - if decomp != "": + if decomp: + decompositions = decomp.split()[1:] + decomp_code_points = [int(i, 16) for i in decompositions] + if decomp.startswith("<"): - seq = [] - for i in decomp.split()[1:]: - seq.append(int(i, 16)) - compat_decomp[code] = seq + # compatibility decomposition + compat_decomp[code] = decomp_code_points else: - seq = [] - for i in decomp.split(): - seq.append(int(i, 16)) - canon_decomp[code] = seq + # canonical decomposition + canon_decomp[code] = decomp_code_points # place letter in categories as appropriate - for cat in [gencat, "Assigned"] + EXPANDED_CATEGORIES.get(gencat, []): - if cat not in gencats: - gencats[cat] = [] - gencats[cat].append(code) + for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])): + general_categories[cat].add(code) + category_assigned_codepoints.add(code) # record combining class, if any if combine != "0": - if combine not in combines: - combines[combine] = [] - combines[combine].append(code) + combines[combine].add(code) # generate Not_Assigned from Assigned - gencats["Cn"] = gen_unassigned(gencats["Assigned"]) - # Assigned is not a real category - del(gencats["Assigned"]) + general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints) + # Other contains Not_Assigned - gencats["C"].extend(gencats["Cn"]) - gencats = group_cats(gencats) - combines = to_combines(group_cats(combines)) + general_categories["C"].update(general_categories["Cn"]) + + grouped_categories = group_categories(general_categories) + # FIXME: combines are not used return UnicodeData( - canon_decomp, compat_decomp, gencats, combines, to_upper, - to_lower, to_title, + to_lower=to_lower, to_upper=to_upper, to_title=to_title, + compat_decomp=compat_decomp, canon_decomp=canon_decomp, + general_categories=grouped_categories, combines=combines, ) def load_special_casing(file_path, unicode_data): + # type: (str, UnicodeData) -> None + """ + Load special casing data and enrich given 
unicode data. + """ for line in fileinput.input(file_path): data = line.split("#")[0].split(";") if len(data) == 5: @@ -277,258 +356,395 @@ def load_special_casing(file_path, unicode_data): (unicode_data.to_upper, upper), (unicode_data.to_title, title)): if values != code: - values = [int(i, 16) for i in values.split()] - for _ in range(len(values), 3): - values.append(0) - assert len(values) == 3 - map_[key] = values - - -def group_cats(cats): - cats_out = {} - for cat in cats: - cats_out[cat] = group_cat(cats[cat]) - return cats_out - - -def group_cat(cat): - cat_out = [] - letters = sorted(set(cat)) - cur_start = letters.pop(0) - cur_end = cur_start - for letter in letters: - assert letter > cur_end, \ - "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter)) - if letter == cur_end + 1: - cur_end = letter - else: - cat_out.append((cur_start, cur_end)) - cur_start = cur_end = letter - cat_out.append((cur_start, cur_end)) - return cat_out + split = values.split() + + codepoints = list(itertools.chain( + (int(i, 16) for i in split), + (0 for _ in range(len(split), 3)) + )) + + assert len(codepoints) == 3 + map_[key] = codepoints + + +def group_categories(mapping): + # type: (Dict[Any, Iterable[int]]) -> Dict[str, List[Tuple[int, int]]] + """ + Group codepoints mapped in "categories". + """ + return {category: group_codepoints(codepoints) + for category, codepoints in mapping.items()} + + +def group_codepoints(codepoints): + # type: (Iterable[int]) -> List[Tuple[int, int]] + """ + Group integral values into continuous, disjoint value ranges. + + Performs value deduplication. + + :return: sorted list of pairs denoting start and end of codepoint + group values, both ends inclusive. + + >>> group_codepoints([1, 2, 10, 11, 12, 3, 4]) + [(1, 4), (10, 12)] + >>> group_codepoints([1]) + [(1, 1)] + >>> group_codepoints([1, 5, 6]) + [(1, 1), (5, 6)] + >>> group_codepoints([]) + [] + """ + sorted_codes = sorted(set(codepoints)) + result = [] # type: List[Tuple[int, int]] + if not sorted_codes: + return result -def ungroup_cat(cat): - cat_out = [] - for (lo, hi) in cat: - while lo <= hi: - cat_out.append(lo) - lo += 1 - return cat_out + next_codes = sorted_codes[1:] + start_code = sorted_codes[0] + for code, next_code in zip_longest(sorted_codes, next_codes, fillvalue=None): + if next_code is None or next_code - code != 1: + result.append((start_code, code)) + start_code = next_code -def gen_unassigned(assigned): - assigned = set(assigned) - return ([i for i in range(0, 0xd800) if i not in assigned] + - [i for i in range(0xe000, 0x110000) if i not in assigned]) + return result -def to_combines(combs): - combs_out = [] - for comb in combs: - for (lo, hi) in combs[comb]: - combs_out.append((lo, hi, comb)) - combs_out.sort(key=lambda c: c[0]) - return combs_out +def ungroup_codepoints(codepoint_pairs): + # type: (Iterable[Tuple[int, int]]) -> List[int] + """ + The inverse of group_codepoints -- produce a flat list of values + from value range pairs. + + >>> ungroup_codepoints([(1, 4), (10, 12)]) + [1, 2, 3, 4, 10, 11, 12] + >>> ungroup_codepoints([(1, 1), (5, 6)]) + [1, 5, 6] + >>> ungroup_codepoints(group_codepoints([1, 2, 7, 8])) + [1, 2, 7, 8] + >>> ungroup_codepoints([]) + [] + """ + return list(itertools.chain.from_iterable( + range(lo, hi + 1) for lo, hi in codepoint_pairs + )) + + +def get_unassigned_codepoints(assigned_codepoints): + # type: (Set[int]) -> Set[int] + """ + Given a set of "assigned" codepoints, return a set + of these that are not in assigned and not surrogate. 
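+
+    Illustrative examples (inputs are kept small here, since the
+    function always scans the full 0..0x110000 range):
+
+    >>> 0x61 in get_unassigned_codepoints({0x61})
+    False
+    >>> 0x62 in get_unassigned_codepoints({0x61})
+    True
+    >>> 0xd800 in get_unassigned_codepoints(set())
+    False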
+ """ + return {i for i in range(0, 0x110000) + if i not in assigned_codepoints and not is_surrogate(i)} + + +def generate_table_lines(items, indent, wrap=98): + # type: (Iterable[str], int, int) -> Iterator[str] + """ + Given table items, generate wrapped lines of text with comma-separated items. + This is a generator function. -def format_table_content(f, content, indent): + :param wrap: soft wrap limit (characters per line), integer. + """ line = " " * indent first = True - for chunk in content.split(","): - if len(line) + len(chunk) < 98: + for item in items: + if len(line) + len(item) < wrap: if first: - line += chunk + line += item else: - line += ", " + chunk + line += ", " + item first = False else: - f.write(line + ",\n") - line = " " * indent + chunk - f.write(line) + yield line + ",\n" + line = " " * indent + item + yield line -def load_properties(file_path, interestingprops): - props = {} - # "raw string" is necessary for \w not to be treated as escape char + +def load_properties(file_path, interesting_props): + # type: (str, Iterable[str]) -> Dict[str, List[Tuple[int, int]]] + """ + Load properties data and return in grouped form. + """ + props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]] + # "raw string" is necessary for \. and \w not to be treated as escape chars # (for the sake of compat with future Python versions) # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)") re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") for line in fileinput.input(file_path): - prop = None - d_lo = 0 - d_hi = 0 - m = re1.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(1) - prop = m.group(2) - else: - m = re2.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(2) - prop = m.group(3) + match = re1.match(line) or re2.match(line) + if match: + groups = match.groups() + + if len(groups) == 2: + # re1 matched + d_lo, prop = groups + d_hi = d_lo else: - continue - if interestingprops and prop not in interestingprops: + d_lo, d_hi, prop = groups + else: continue - d_lo = int(d_lo, 16) - d_hi = int(d_hi, 16) - if prop not in props: - props[prop] = [] - props[prop].append((d_lo, d_hi)) + + if interesting_props and prop not in interesting_props: + continue + + lo_value = int(d_lo, 16) + hi_value = int(d_hi, 16) + + props[prop].append((lo_value, hi_value)) # optimize if possible for prop in props: - props[prop] = group_cat(ungroup_cat(props[prop])) + props[prop] = group_codepoints(ungroup_codepoints(props[prop])) return props def escape_char(c): - return "'\\u{%x}'" % c if c != 0 else "'\\0'" + # type: (int) -> str + r""" + Escape a codepoint for use as Rust char literal. + Outputs are OK to use as Rust source code as char literals + and they also include necessary quotes. -def emit_table(f, name, t_data, t_type="&[(char, char)]", is_pub=True, - pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))): + >>> escape_char(97) + "'\\u{61}'" + >>> escape_char(0) + "'\\0'" + """ + return r"'\u{%x}'" % c if c != 0 else r"'\0'" + + +def format_char_pair(pair): + # type: (Tuple[int, int]) -> str + """ + Format a pair of two Rust chars. + """ + return "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1])) + + +def generate_table( + name, # type: str + items, # type: List[Tuple[int, int]] + decl_type="&[(char, char)]", # type: str + is_pub=True, # type: bool + format_item=format_char_pair, # type: Callable[[Tuple[int, int]], str] +): + # type: (...) 
-> Iterator[str] + """ + Generate a nicely formatted Rust constant "table" array. + + This generates actual Rust code. + """ pub_string = "" if is_pub: pub_string = "pub " - f.write(" %sconst %s: %s = &[\n" % (pub_string, name, t_type)) - data = "" + + yield " %sconst %s: %s = &[\n" % (pub_string, name, decl_type) + + data = [] first = True - for dat in t_data: + for item in items: if not first: - data += "," + data.append(",") first = False - data += pfun(dat) - format_table_content(f, data, 8) - f.write("\n ];\n\n") + data.extend(format_item(item)) + + for table_line in generate_table_lines("".join(data).split(","), 8): + yield table_line + yield "\n ];\n\n" -def compute_trie(rawdata, chunksize): + +def compute_trie(raw_data, chunk_size): + # type: (List[int], int) -> Tuple[List[int], List[int]] + """ + Compute postfix-compressed trie. + + See: bool_trie.rs for more details. + + >>> compute_trie([1, 2, 3, 1, 2, 3, 4, 5, 6], 3) + ([0, 0, 1], [1, 2, 3, 4, 5, 6]) + >>> compute_trie([1, 2, 3, 1, 2, 4, 4, 5, 6], 3) + ([0, 1, 2], [1, 2, 3, 1, 2, 4, 4, 5, 6]) + """ root = [] - childmap = {} + childmap = {} # type: Dict[Tuple[int, ...], int] child_data = [] - for i in range(len(rawdata) // chunksize): - data = rawdata[i * chunksize: (i + 1) * chunksize] - child = "|".join(map(str, data)) + + assert len(raw_data) % chunk_size == 0, "Chunks must be equally sized" + + for i in range(len(raw_data) // chunk_size): + data = raw_data[i * chunk_size : (i + 1) * chunk_size] + + # postfix compression of child nodes (data chunks) + # (identical child nodes are shared) + + # make a tuple out of the list so it's hashable + child = tuple(data) if child not in childmap: childmap[child] = len(childmap) child_data.extend(data) + root.append(childmap[child]) + return root, child_data -def emit_bool_trie(f, name, t_data, is_pub=True): +def generate_bool_trie(name, codepoint_ranges, is_pub=True): + # type: (str, List[Tuple[int, int]], bool) -> Iterator[str] + """ + Generate Rust code for BoolTrie struct. + + This yields string fragments that should be joined to produce + the final string. 
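+
+    The emitted fields mirror the BoolTrie struct: r1 is a direct
+    bitmap for codepoints below 0x800, r2/r3 form the trie for
+    0x800..0x10000, and r4/r5/r6 the trie for 0x10000..0x110000.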
+ + See: bool_trie.rs + """ chunk_size = 64 rawdata = [False] * 0x110000 - for (lo, hi) in t_data: + for (lo, hi) in codepoint_ranges: for cp in range(lo, hi + 1): rawdata[cp] = True - # convert to bitmap chunks of 64 bits each + # convert to bitmap chunks of chunk_size bits each chunks = [] for i in range(0x110000 // chunk_size): chunk = 0 - for j in range(64): - if rawdata[i * 64 + j]: + for j in range(chunk_size): + if rawdata[i * chunk_size + j]: chunk |= 1 << j chunks.append(chunk) pub_string = "" if is_pub: pub_string = "pub " - f.write(" %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)) - f.write(" r1: [\n") - data = ",".join("0x%016x" % chunk for chunk in chunks[0:0x800 // chunk_size]) - format_table_content(f, data, 12) - f.write("\n ],\n") + yield " %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name) + yield " r1: [\n" + data = ("0x%016x" % chunk for chunk in chunks[:0x800 // chunk_size]) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" # 0x800..0x10000 trie (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size) - f.write(" r2: [\n") - data = ",".join(str(node) for node in r2) - format_table_content(f, data, 12) - f.write("\n ],\n") - f.write(" r3: &[\n") - data = ",".join("0x%016x" % chunk for chunk in r3) - format_table_content(f, data, 12) - f.write("\n ],\n") + yield " r2: [\n" + data = map(str, r2) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r3: &[\n" + data = ("0x%016x" % node for node in r3) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" # 0x10000..0x110000 trie (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size) (r4, r5) = compute_trie(mid, 64) - f.write(" r4: [\n") - data = ",".join(str(node) for node in r4) - format_table_content(f, data, 12) - f.write("\n ],\n") - f.write(" r5: &[\n") - data = ",".join(str(node) for node in r5) - format_table_content(f, data, 12) - f.write("\n ],\n") - f.write(" r6: &[\n") - data = ",".join("0x%016x" % chunk for chunk in r6) - format_table_content(f, data, 12) - f.write("\n ],\n") - - f.write(" };\n\n") - - -def emit_small_bool_trie(f, name, t_data, is_pub=True): - last_chunk = max(hi // 64 for (lo, hi) in t_data) + + yield " r4: [\n" + data = map(str, r4) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r5: &[\n" + data = map(str, r5) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r6: &[\n" + data = ("0x%016x" % node for node in r6) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " };\n\n" + + +def generate_small_bool_trie(name, codepoint_ranges, is_pub=True): + # type: (str, List[Tuple[int, int]], bool) -> Iterator[str] + """ + Generate Rust code for SmallBoolTrie struct. 
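+
+    Like generate_bool_trie, this yields string fragments that the
+    caller is expected to join into the final string.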
+ + See: bool_trie.rs + """ + last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges) n_chunks = last_chunk + 1 chunks = [0] * n_chunks - for (lo, hi) in t_data: + for (lo, hi) in codepoint_ranges: for cp in range(lo, hi + 1): - if cp // 64 >= len(chunks): - print(cp, cp // 64, len(chunks), lo, hi) + assert cp // 64 < len(chunks) chunks[cp // 64] |= 1 << (cp & 63) pub_string = "" if is_pub: pub_string = "pub " - f.write(" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n" - % (pub_string, name)) + + yield (" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n" + % (pub_string, name)) (r1, r2) = compute_trie(chunks, 1) - f.write(" r1: &[\n") - data = ",".join(str(node) for node in r1) - format_table_content(f, data, 12) - f.write("\n ],\n") + yield " r1: &[\n" + data = (str(node) for node in r1) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r2: &[\n" + data = ("0x%016x" % node for node in r2) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" - f.write(" r2: &[\n") - data = ",".join("0x%016x" % node for node in r2) - format_table_content(f, data, 12) - f.write("\n ],\n") + yield " };\n\n" - f.write(" };\n\n") +def generate_property_module(mod, grouped_categories, category_subset): + # type: (str, Dict[str, List[Tuple[int, int]]], Iterable[str]) -> Iterator[str] + """ + Generate Rust code for module defining properties. + """ -def emit_property_module(f, mod, tbl, emit): - f.write("pub mod %s {\n" % mod) - for cat in sorted(emit): - if cat in ["Cc", "White_Space", "Pattern_White_Space"]: - emit_small_bool_trie(f, "%s_table" % cat, tbl[cat]) - f.write(" pub fn %s(c: char) -> bool {\n" % cat) - f.write(" %s_table.lookup(c)\n" % cat) - f.write(" }\n\n") + yield "pub mod %s {\n" % mod + for cat in sorted(category_subset): + if cat in ("Cc", "White_Space", "Pattern_White_Space"): + generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat]) else: - emit_bool_trie(f, "%s_table" % cat, tbl[cat]) - f.write(" pub fn %s(c: char) -> bool {\n" % cat) - f.write(" %s_table.lookup(c)\n" % cat) - f.write(" }\n\n") - f.write("}\n\n") + generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat]) + + for fragment in generator: + yield fragment + + yield " pub fn %s(c: char) -> bool {\n" % cat + yield " %s_table.lookup(c)\n" % cat + yield " }\n\n" + + yield "}\n\n" + +def generate_conversions_module(unicode_data): + # type: (UnicodeData) -> Iterator[str] + """ + Generate Rust code for module defining conversions. 
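+
+    The generated module exposes to_lower/to_upper functions backed
+    by binary search over the emitted case conversion tables.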
+ """ -def emit_conversions_module(f, unicode_data): - f.write("pub mod conversions {") - f.write(""" + yield "pub mod conversions {" + yield """ pub fn to_lower(c: char) -> [char; 3] { match bsearch_case_table(c, to_lowercase_table) { None => [c, '\\0', '\\0'], @@ -545,46 +761,39 @@ def emit_conversions_module(f, unicode_data): fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option { table.binary_search_by(|&(key, _)| key.cmp(&c)).ok() - } - -""") - t_type = "&[(char, [char; 3])]" - pfun = lambda x: "(%s,[%s,%s,%s])" % ( - escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])) - - emit_table(f, - name="to_lowercase_table", - t_data=sorted(unicode_data.to_lower.items(), key=operator.itemgetter(0)), - t_type=t_type, - is_pub=False, - pfun=pfun) - - emit_table(f, - name="to_uppercase_table", - t_data=sorted(unicode_data.to_upper.items(), key=operator.itemgetter(0)), - t_type=t_type, - is_pub=False, - pfun=pfun) - - f.write("}\n") - - -def emit_norm_module(f, unicode_data, norm_props): - canon_keys = sorted(unicode_data.canon_decomp.keys()) - - canon_comp = {} - comp_exclusions = norm_props["Full_Composition_Exclusion"] - for char in canon_keys: - if any(lo <= char <= hi for lo, hi in comp_exclusions): - continue - decomp = unicode_data.canon_decomp[char] - if len(decomp) == 2: - if decomp[0] not in canon_comp: - canon_comp[decomp[0]] = [] - canon_comp[decomp[0]].append((decomp[1], char)) + }\n\n""" + + decl_type = "&[(char, [char; 3])]" + format_conversion = lambda x: "({},[{},{},{}])".format(*( + escape_char(c) for c in (x[0], x[1][0], x[1][1], x[1][2]) + )) + + for fragment in generate_table( + name="to_lowercase_table", + items=sorted(unicode_data.to_lower.items(), key=lambda x: x[0]), + decl_type=decl_type, + is_pub=False, + format_item=format_conversion + ): + yield fragment + + for fragment in generate_table( + name="to_uppercase_table", + items=sorted(unicode_data.to_upper.items(), key=lambda x: x[0]), + decl_type=decl_type, + is_pub=False, + format_item=format_conversion + ): + yield fragment + + yield "}\n" def parse_args(): + # type: () -> argparse.Namespace + """ + Parse command line arguments. + """ parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-v", "--version", default=None, type=str, help="Unicode version to use (if not specified," @@ -594,56 +803,63 @@ def parse_args(): def main(): + # type: () -> None + """ + Script entry point. + """ args = parse_args() unicode_version = fetch_files(args.version) print("Using Unicode version: {}".format(unicode_version.as_str)) + # all the writing happens entirely in memory, we only write to file + # once we have generated the file content (it's not very large, <1 MB) + buf = StringIO() + buf.write(PREAMBLE) + + unicode_version_notice = textwrap.dedent(""" + /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of + /// `char` and `str` methods are based on. 
+ #[unstable(feature = "unicode_version", issue = "49726")] + pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{ + major: {version.major}, + minor: {version.minor}, + micro: {version.micro}, + _priv: (), + }}; + """).format(version=unicode_version) + buf.write(unicode_version_notice) + + get_path = lambda f: get_unicode_file_path(unicode_version, f) + + unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA)) + load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data) + + want_derived = {"XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase", + "Cased", "Case_Ignorable", "Grapheme_Extend"} + derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived) + + props = load_properties(get_path(UnicodeFiles.PROPS), + {"White_Space", "Join_Control", "Noncharacter_Code_Point", + "Pattern_White_Space"}) + + # category tables + for (name, categories, category_subset) in ( + ("general_category", unicode_data.general_categories, ["N", "Cc"]), + ("derived_property", derived, want_derived), + ("property", props, ["White_Space", "Pattern_White_Space"]) + ): + for fragment in generate_property_module(name, categories, category_subset): + buf.write(fragment) + + for fragment in generate_conversions_module(unicode_data): + buf.write(fragment) + tables_rs_path = os.path.join(THIS_DIR, "tables.rs") # will overwrite the file if it exists - with open(tables_rs_path, "w") as rf: - rf.write(PREAMBLE) - - unicode_version_notice = textwrap.dedent(""" - /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of - /// `char` and `str` methods are based on. - #[unstable(feature = "unicode_version", issue = "49726")] - pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{ - major: {version.major}, - minor: {version.minor}, - micro: {version.micro}, - _priv: (), - }}; - """).format(version=unicode_version) - rf.write(unicode_version_notice) - - get_path = lambda f: get_unicode_file_path(unicode_version, f) - - unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA)) - load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data) - - want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase", - "Cased", "Case_Ignorable", "Grapheme_Extend"] - derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived) - - # FIXME scripts not used? 
- scripts = load_properties(get_path(UnicodeFiles.SCRIPTS), []) - props = load_properties(get_path(UnicodeFiles.PROPS), - ["White_Space", "Join_Control", "Noncharacter_Code_Point", - "Pattern_White_Space"]) - norm_props = load_properties(get_path(UnicodeFiles.DERIVED_NORMALIZATION_PROPS), - ["Full_Composition_Exclusion"]) - - # category tables - for (name, cat, pfuns) in (("general_category", unicode_data.gencats, ["N", "Cc"]), - ("derived_property", derived, want_derived), - ("property", props, ["White_Space", "Pattern_White_Space"])): - emit_property_module(rf, name, cat, pfuns) - - # normalizations and conversions module - emit_norm_module(rf, unicode_data, norm_props) - emit_conversions_module(rf, unicode_data) + with open(tables_rs_path, "w") as fd: + fd.write(buf.getvalue()) print("Regenerated tables.rs.") From 60ccf89693037b3c010b027081d253b9c69a304c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Mon, 10 Jun 2019 20:45:58 +0200 Subject: [PATCH 05/15] Apply suggestions from code review Co-Authored-By: varkor --- src/libcore/unicode/unicode.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py index f66e82299100d..9eaf6eb9baa9e 100755 --- a/src/libcore/unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -36,7 +36,7 @@ try: # completely optional type hinting # (Python 2 compatible using comments, - # see: https://mypy.readthedocs.io/en/latest/python2.html) + # see: https://mypy.readthedocs.io/en/latest/python2.html) # This is very helpful in typing-aware IDE like PyCharm. from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple except ImportError: @@ -95,7 +95,8 @@ class UnicodeFiles(object): "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"], } -# this is the surrogate codepoints range (both ends inclusive) +# This is the (inclusive) range of surrogate codepoints. +# These are not valid Rust characters. # - they are not valid Rust characters SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff) @@ -122,7 +123,7 @@ def fetch_files(version=None): """ Fetch all the Unicode files from unicode.org. - This will use cached files (stored in FETCH_DIR) if they exist, + This will use cached files (stored in `FETCH_DIR`) if they exist, creating them if they don't. In any case, the Unicode version is always returned. @@ -797,7 +798,7 @@ def parse_args(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-v", "--version", default=None, type=str, help="Unicode version to use (if not specified," - " defaults to latest available final release).") + " defaults to latest release).") return parser.parse_args() From 2b47a085dd418447f1dd79986df94dd051f27c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Mon, 10 Jun 2019 21:13:01 +0200 Subject: [PATCH 06/15] Address review remarks in unicode.py --- src/libcore/unicode/unicode.py | 116 +++++++++++++++++---------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py index 9eaf6eb9baa9e..a0539cd9ca9b6 100755 --- a/src/libcore/unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -34,7 +34,7 @@ from StringIO import StringIO try: - # completely optional type hinting + # Completely optional type hinting # (Python 2 compatible using comments, # see: https://mypy.readthedocs.io/en/latest/python2.html) # This is very helpful in typing-aware IDE like PyCharm. 
@@ -43,9 +43,9 @@ pass -# we don't use enum.Enum because of Python 2.7 compatibility +# We don't use enum.Enum because of Python 2.7 compatibility. class UnicodeFiles(object): - # ReadMe does not contain any unicode data, we + # ReadMe does not contain any Unicode data, we # only use it to extract versions. README = "ReadMe.txt" @@ -57,11 +57,15 @@ class UnicodeFiles(object): UNICODE_DATA = "UnicodeData.txt" -UnicodeFiles.ALL_FILES = tuple( - getattr(UnicodeFiles, name) for name in dir(UnicodeFiles) +# The order doesn't really matter (Python < 3.6 won't preserve it), +# we only want to aggregate all the file names. +ALL_UNICODE_FILES = tuple( + value for name, value in UnicodeFiles.__dict__.items() if not name.startswith("_") ) +assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files" + # The directory this file is located in. THIS_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -97,18 +101,17 @@ class UnicodeFiles(object): # This is the (inclusive) range of surrogate codepoints. # These are not valid Rust characters. -# - they are not valid Rust characters SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff) UnicodeData = namedtuple( "UnicodeData", ( - # conversions: + # Conversions: "to_upper", "to_lower", "to_title", - # decompositions: canonical decompositions, compatibility decomp + # Decompositions: canonical decompositions, compatibility decomp "canon_decomp", "compat_decomp", - # grouped: general categories and combining characters + # Grouped: general categories and combining characters "general_categories", "combines", ) ) @@ -136,10 +139,10 @@ def fetch_files(version=None): return have_version if version: - # check if the desired version exists on the server + # Check if the desired version exists on the server. get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name) else: - # extract the latest version + # Extract the latest version. get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name) readme_url = get_fetch_url(UnicodeFiles.README) @@ -153,14 +156,14 @@ def fetch_files(version=None): download_dir = get_unicode_dir(unicode_version) if not os.path.exists(download_dir): - # for 2.7 compat, we don't use exist_ok=True + # For 2.7 compat, we don't use `exist_ok=True`. os.makedirs(download_dir) - for filename in UnicodeFiles.ALL_FILES: + for filename in ALL_UNICODE_FILES: file_path = get_unicode_file_path(unicode_version, filename) if os.path.exists(file_path): - # assume file on the server didn't change if it's been saved before + # Assume file on the server didn't change if it's been saved before. continue if filename == UnicodeFiles.README: @@ -178,15 +181,16 @@ def check_stored_version(version): # type: (Optional[str]) -> Optional[UnicodeVersion] """ Given desired Unicode version, return the version - if stored files are all present, and None otherwise. + if stored files are all present, and `None` otherwise. """ if not version: - # should always check latest version + # If no desired version specified, we should check what's the latest + # version, skipping stored version checks. return None fetch_dir = os.path.join(FETCH_DIR, version) - for filename in UnicodeFiles.ALL_FILES: + for filename in ALL_UNICODE_FILES: file_path = os.path.join(fetch_dir, filename) if not os.path.exists(file_path): @@ -199,11 +203,11 @@ def check_stored_version(version): def parse_readme_unicode_version(readme_content): # type: (str) -> UnicodeVersion """ - Parse the Unicode version contained in their ReadMe.txt file. 
+ Parse the Unicode version contained in their `ReadMe.txt` file. """ - # "raw string" is necessary for \d not being treated as escape char - # (for the sake of compat with future Python versions) - # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior + # "Raw string" is necessary for \d not being treated as escape char + # (for the sake of compat with future Python versions). + # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" groups = re.search(pattern, readme_content).groups() @@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content): def get_unicode_dir(unicode_version): # type: (UnicodeVersion) -> str """ - Indicate where the unicode data files should be stored. + Indicate in which parent dir the Unicode data files should be stored. This returns a full, absolute path. """ @@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version): def get_unicode_file_path(unicode_version, filename): # type: (UnicodeVersion, str) -> str """ - Indicate where the unicode data file should be stored. + Indicate where the Unicode data file should be stored. """ return os.path.join(get_unicode_dir(unicode_version), filename) @@ -239,22 +243,22 @@ def is_surrogate(n): def load_unicode_data(file_path): # type: (str) -> UnicodeData """ - Load main unicode data. + Load main Unicode data. """ - # conversions + # Conversions to_lower = {} # type: Dict[int, Tuple[int, int, int]] to_upper = {} # type: Dict[int, Tuple[int, int, int]] to_title = {} # type: Dict[int, Tuple[int, int, int]] - # decompositions + # Decompositions compat_decomp = {} # type: Dict[int, List[int]] canon_decomp = {} # type: Dict[int, List[int]] - # combining characters + # Combining characters # FIXME: combines are not used combines = defaultdict(set) # type: Dict[str, Set[int]] - # categories + # Categories general_categories = defaultdict(set) # type: Dict[str, Set[int]] category_assigned_codepoints = set() # type: Set[int] @@ -283,41 +287,42 @@ def load_unicode_data(file_path): decomp, deci, digit, num, mirror, old, iso, upcase, lowcase, titlecase) = data - # generate char to char direct common and simple conversions - # uppercase to lowercase + # Generate char to char direct common and simple conversions: + + # Uppercase to lowercase if lowcase != "" and code_org != lowcase: to_lower[code] = (int(lowcase, 16), 0, 0) - # lowercase to uppercase + # Lowercase to uppercase if upcase != "" and code_org != upcase: to_upper[code] = (int(upcase, 16), 0, 0) - # title case + # Title case if titlecase.strip() != "" and code_org != titlecase: to_title[code] = (int(titlecase, 16), 0, 0) - # store decomposition, if given + # Store decomposition, if given if decomp: decompositions = decomp.split()[1:] decomp_code_points = [int(i, 16) for i in decompositions] if decomp.startswith("<"): - # compatibility decomposition + # Compatibility decomposition compat_decomp[code] = decomp_code_points else: - # canonical decomposition + # Canonical decomposition canon_decomp[code] = decomp_code_points - # place letter in categories as appropriate + # Place letter in categories as appropriate. for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])): general_categories[cat].add(code) category_assigned_codepoints.add(code) - # record combining class, if any + # Record combining class, if any. if combine != "0": combines[combine].add(code) - # generate Not_Assigned from Assigned + # Generate Not_Assigned from Assigned. 
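A note on the conversion entries built above: `to_lower`/`to_upper`/`to_title`
store `(mapping, 0, 0)` triples because a single character may case-map to up
to three characters once SpecialCasing.txt is merged in by load_special_casing.
A hedged Rust sketch of the kind of lookup the generated casing tables support
(the table layout here is illustrative, not the exact tables.rs output):

    /// Look up `c` in a sorted table of `(char, [char; 3])` entries.
    fn apply_case_mapping(c: char, table: &[(char, [char; 3])]) -> [char; 3] {
        match table.binary_search_by(|&(key, _)| key.cmp(&c)) {
            // Unmapped characters map to themselves; NULs pad the triple.
            Err(_) => [c, '\u{0}', '\u{0}'],
            Ok(i) => table[i].1,
        }
    }
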
general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints) # Other contains Not_Assigned @@ -336,7 +341,7 @@ def load_unicode_data(file_path): def load_special_casing(file_path, unicode_data): # type: (str, UnicodeData) -> None """ - Load special casing data and enrich given unicode data. + Load special casing data and enrich given Unicode data. """ for line in fileinput.input(file_path): data = line.split("#")[0].split(";") @@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props): Load properties data and return in grouped form. """ props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]] - # "raw string" is necessary for \. and \w not to be treated as escape chars - # (for the sake of compat with future Python versions) - # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior + # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars + # (for the sake of compat with future Python versions). + # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)") re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") @@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props): groups = match.groups() if len(groups) == 2: - # re1 matched + # `re1` matched (2 groups). d_lo, prop = groups d_hi = d_lo else: @@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props): props[prop].append((lo_value, hi_value)) - # optimize if possible + # Optimize if possible. for prop in props: props[prop] = group_codepoints(ungroup_codepoints(props[prop])) @@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size): for i in range(len(raw_data) // chunk_size): data = raw_data[i * chunk_size : (i + 1) * chunk_size] - # postfix compression of child nodes (data chunks) - # (identical child nodes are shared) + # Postfix compression of child nodes (data chunks) + # (identical child nodes are shared). - # make a tuple out of the list so it's hashable + # Make a tuple out of the list so it's hashable. child = tuple(data) if child not in childmap: childmap[child] = len(childmap) @@ -609,7 +614,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True): This yields string fragments that should be joined to produce the final string. - See: bool_trie.rs + See: `bool_trie.rs`. """ chunk_size = 64 rawdata = [False] * 0x110000 @@ -617,7 +622,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True): for cp in range(lo, hi + 1): rawdata[cp] = True - # convert to bitmap chunks of chunk_size bits each + # Convert to bitmap chunks of `chunk_size` bits each. chunks = [] for i in range(0x110000 // chunk_size): chunk = 0 @@ -679,9 +684,9 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True): def generate_small_bool_trie(name, codepoint_ranges, is_pub=True): # type: (str, List[Tuple[int, int]], bool) -> Iterator[str] """ - Generate Rust code for SmallBoolTrie struct. + Generate Rust code for `SmallBoolTrie` struct. - See: bool_trie.rs + See: `bool_trie.rs`. 
""" last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges) n_chunks = last_chunk + 1 @@ -813,8 +818,8 @@ def main(): unicode_version = fetch_files(args.version) print("Using Unicode version: {}".format(unicode_version.as_str)) - # all the writing happens entirely in memory, we only write to file - # once we have generated the file content (it's not very large, <1 MB) + # All the writing happens entirely in memory, we only write to file + # once we have generated the file content (it's not very large, <1 MB). buf = StringIO() buf.write(PREAMBLE) @@ -844,7 +849,7 @@ def main(): {"White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"}) - # category tables + # Category tables for (name, categories, category_subset) in ( ("general_category", unicode_data.general_categories, ["N", "Cc"]), ("derived_property", derived, want_derived), @@ -858,7 +863,8 @@ def main(): tables_rs_path = os.path.join(THIS_DIR, "tables.rs") - # will overwrite the file if it exists + # Actually write out the file content. + # Will overwrite the file if it exists. with open(tables_rs_path, "w") as fd: fd.write(buf.getvalue()) From fea8194a49dac8f330cbdb3c6690b86ae5525904 Mon Sep 17 00:00:00 2001 From: Vadim Kaushan Date: Mon, 1 Jul 2019 23:52:04 +0300 Subject: [PATCH 07/15] Update LLVM: apply patches for pc-relative addressing on 64-bit RISC-V --- src/llvm-project | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llvm-project b/src/llvm-project index 1bbe0b3e1d756..8538d56b2dd45 160000 --- a/src/llvm-project +++ b/src/llvm-project @@ -1 +1 @@ -Subproject commit 1bbe0b3e1d756116cbf1fcf049555066ef929008 +Subproject commit 8538d56b2dd450063547a7690f7ffa2ac37c9c65 From c65ffa789d57004db42f6c30405c59e0a5bae330 Mon Sep 17 00:00:00 2001 From: Vadim Kaushan Date: Mon, 1 Jul 2019 23:52:40 +0300 Subject: [PATCH 08/15] Use code model 'medium' for 64-bit RISC-V targets --- src/librustc_target/spec/riscv64gc_unknown_none_elf.rs | 1 + src/librustc_target/spec/riscv64imac_unknown_none_elf.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/src/librustc_target/spec/riscv64gc_unknown_none_elf.rs b/src/librustc_target/spec/riscv64gc_unknown_none_elf.rs index a5c13fa28e2ce..8ef197461d92f 100644 --- a/src/librustc_target/spec/riscv64gc_unknown_none_elf.rs +++ b/src/librustc_target/spec/riscv64gc_unknown_none_elf.rs @@ -23,6 +23,7 @@ pub fn target() -> TargetResult { executables: true, panic_strategy: PanicStrategy::Abort, relocation_model: "static".to_string(), + code_model: Some("medium".to_string()), emit_debug_gdb_scripts: false, abi_blacklist: super::riscv_base::abi_blacklist(), eliminate_frame_pointer: false, diff --git a/src/librustc_target/spec/riscv64imac_unknown_none_elf.rs b/src/librustc_target/spec/riscv64imac_unknown_none_elf.rs index 237d615ffcc4b..e8a91f0ef9db0 100644 --- a/src/librustc_target/spec/riscv64imac_unknown_none_elf.rs +++ b/src/librustc_target/spec/riscv64imac_unknown_none_elf.rs @@ -23,6 +23,7 @@ pub fn target() -> TargetResult { executables: true, panic_strategy: PanicStrategy::Abort, relocation_model: "static".to_string(), + code_model: Some("medium".to_string()), emit_debug_gdb_scripts: false, abi_blacklist: super::riscv_base::abi_blacklist(), eliminate_frame_pointer: false, From 45dda939ab558b244d9099798047ec6e6376cff1 Mon Sep 17 00:00:00 2001 From: Andrew Xu Date: Mon, 1 Jul 2019 22:18:45 +0800 Subject: [PATCH 09/15] Move async-await tests from run-pass to ui --- src/test/{run-pass => ui}/async-await/async-fn-size.rs | 5 +++-- src/test/{run-pass => 
ui}/async-await/issue-60709.rs | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)
 rename src/test/{run-pass => ui}/async-await/async-fn-size.rs (98%)
 rename src/test/{run-pass => ui}/async-await/issue-60709.rs (98%)

diff --git a/src/test/run-pass/async-await/async-fn-size.rs b/src/test/ui/async-await/async-fn-size.rs
similarity index 98%
rename from src/test/run-pass/async-await/async-fn-size.rs
rename to src/test/ui/async-await/async-fn-size.rs
index 7396918196c08..e4dfd4d829e69 100644
--- a/src/test/run-pass/async-await/async-fn-size.rs
+++ b/src/test/ui/async-await/async-fn-size.rs
@@ -1,9 +1,10 @@
+// run-pass
+
 // edition:2018

 #![feature(async_await)]

-#[path = "../auxiliary/arc_wake.rs"]
-mod arc_wake;
+extern crate arc_wake;

 use std::pin::Pin;
 use std::future::Future;
diff --git a/src/test/run-pass/async-await/issue-60709.rs b/src/test/ui/async-await/issue-60709.rs
similarity index 98%
rename from src/test/run-pass/async-await/issue-60709.rs
rename to src/test/ui/async-await/issue-60709.rs
index 5ebb18b999ab6..ad0b49fa4a219 100644
--- a/src/test/run-pass/async-await/issue-60709.rs
+++ b/src/test/ui/async-await/issue-60709.rs
@@ -2,6 +2,8 @@
 // handled incorrectly in generators.
 // compile-flags: -Copt-level=z -Cdebuginfo=2 --edition=2018

+// run-pass
+
 #![feature(async_await)]
 #![allow(unused)]

From b14a2ec400aa58a71a8e69a05f8bb41f3b571c2b Mon Sep 17 00:00:00 2001
From: Andrew Xu
Date: Wed, 3 Jul 2019 22:37:33 +0800
Subject: [PATCH 10/15] Remove duplicated arc_wake.rs

The auxiliary file arc_wake.rs is in run-pass/auxiliary and also
ui/async-await/auxiliary. Remove the former one as their contents are the
same. Move run-pass/futures-api.rs to ui/async-await/futures-api.rs as
it needs to use arc_wake.rs.
---
 src/test/run-pass/auxiliary/arc_wake.rs | 64 -------------------
 .../async-await}/futures-api.rs         |  2 +
 2 files changed, 2 insertions(+), 64 deletions(-)
 delete mode 100644 src/test/run-pass/auxiliary/arc_wake.rs
 rename src/test/{run-pass => ui/async-await}/futures-api.rs (98%)

diff --git a/src/test/run-pass/auxiliary/arc_wake.rs b/src/test/run-pass/auxiliary/arc_wake.rs
deleted file mode 100644
index c21886f26f467..0000000000000
--- a/src/test/run-pass/auxiliary/arc_wake.rs
+++ /dev/null
@@ -1,64 +0,0 @@
-// edition:2018
-
-use std::sync::Arc;
-use std::task::{
-    Waker, RawWaker, RawWakerVTable,
-};
-
-macro_rules! waker_vtable {
waker_vtable { - ($ty:ident) => { - &RawWakerVTable::new( - clone_arc_raw::<$ty>, - wake_arc_raw::<$ty>, - wake_by_ref_arc_raw::<$ty>, - drop_arc_raw::<$ty>, - ) - }; -} - -pub trait ArcWake { - fn wake(self: Arc); - - fn wake_by_ref(arc_self: &Arc) { - arc_self.clone().wake() - } - - fn into_waker(wake: Arc) -> Waker where Self: Sized - { - let ptr = Arc::into_raw(wake) as *const (); - - unsafe { - Waker::from_raw(RawWaker::new(ptr, waker_vtable!(Self))) - } - } -} - -unsafe fn increase_refcount(data: *const ()) { - // Retain Arc by creating a copy - let arc: Arc = Arc::from_raw(data as *const T); - let arc_clone = arc.clone(); - // Forget the Arcs again, so that the refcount isn't decrased - let _ = Arc::into_raw(arc); - let _ = Arc::into_raw(arc_clone); -} - -unsafe fn clone_arc_raw(data: *const ()) -> RawWaker { - increase_refcount::(data); - RawWaker::new(data, waker_vtable!(T)) -} - -unsafe fn drop_arc_raw(data: *const ()) { - // Drop Arc - let _: Arc = Arc::from_raw(data as *const T); -} - -unsafe fn wake_arc_raw(data: *const ()) { - let arc: Arc = Arc::from_raw(data as *const T); - ArcWake::wake(arc); -} - -unsafe fn wake_by_ref_arc_raw(data: *const ()) { - let arc: Arc = Arc::from_raw(data as *const T); - ArcWake::wake_by_ref(&arc); - let _ = Arc::into_raw(arc); -} diff --git a/src/test/run-pass/futures-api.rs b/src/test/ui/async-await/futures-api.rs similarity index 98% rename from src/test/run-pass/futures-api.rs rename to src/test/ui/async-await/futures-api.rs index ee77053fd5b6a..a7da058de3081 100644 --- a/src/test/run-pass/futures-api.rs +++ b/src/test/ui/async-await/futures-api.rs @@ -1,3 +1,5 @@ +// run-pass + // aux-build:arc_wake.rs extern crate arc_wake; From 73aee89b47972da57b0f0b1c99ba4e2893ad0a20 Mon Sep 17 00:00:00 2001 From: Andrew Xu Date: Wed, 3 Jul 2019 23:10:03 +0800 Subject: [PATCH 11/15] Move the test async-fn-size-moved-locals to ui --- .../{run-pass => ui}/async-await/async-fn-size-moved-locals.rs | 2 ++ 1 file changed, 2 insertions(+) rename src/test/{run-pass => ui}/async-await/async-fn-size-moved-locals.rs (99%) diff --git a/src/test/run-pass/async-await/async-fn-size-moved-locals.rs b/src/test/ui/async-await/async-fn-size-moved-locals.rs similarity index 99% rename from src/test/run-pass/async-await/async-fn-size-moved-locals.rs rename to src/test/ui/async-await/async-fn-size-moved-locals.rs index 139be7fe0132b..8d24ffe7a7c8c 100644 --- a/src/test/run-pass/async-await/async-fn-size-moved-locals.rs +++ b/src/test/ui/async-await/async-fn-size-moved-locals.rs @@ -7,6 +7,8 @@ // // See issue #59123 for a full explanation. 
+// run-pass + // edition:2018 #![feature(async_await)] From f115147fc481a4431116be526ce4dec315b1f871 Mon Sep 17 00:00:00 2001 From: Andrew Xu Date: Thu, 4 Jul 2019 21:04:20 +0800 Subject: [PATCH 12/15] Add missing aux-build directive --- src/test/ui/async-await/async-fn-size.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/ui/async-await/async-fn-size.rs b/src/test/ui/async-await/async-fn-size.rs index e4dfd4d829e69..c6b2ed13b0a8d 100644 --- a/src/test/ui/async-await/async-fn-size.rs +++ b/src/test/ui/async-await/async-fn-size.rs @@ -1,5 +1,5 @@ // run-pass - +// aux-build:arc_wake.rs // edition:2018 #![feature(async_await)] From 0d9b477c59d2e4f83e05cd6f475e784f55473dd1 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 19 Dec 2018 18:11:31 +0100 Subject: [PATCH 13/15] rustc: Remove `dylib` crate type from most rustc crates Now that procedural macros no longer link transitively to libsyntax, this shouldn't be needed any more! This commit is an experiment in removing all dynamic libraries from rustc except for librustc_driver itself. Let's see how far we can get with that! --- src/libarena/Cargo.toml | 1 - src/libfmt_macros/Cargo.toml | 1 + src/libgraphviz/Cargo.toml | 1 - src/librustc/Cargo.toml | 2 +- src/librustc/lib.rs | 1 - src/librustc_allocator/Cargo.toml | 1 - src/librustc_apfloat/Cargo.toml | 1 - src/librustc_apfloat/lib.rs | 3 --- src/librustc_borrowck/Cargo.toml | 2 +- src/librustc_codegen_llvm/lib.rs | 1 + src/librustc_codegen_ssa/Cargo.toml | 1 - src/librustc_codegen_utils/Cargo.toml | 1 - src/librustc_cratesio_shim/Cargo.toml | 26 ------------------- src/librustc_cratesio_shim/src/lib.rs | 11 -------- src/librustc_data_structures/Cargo.toml | 3 +-- src/librustc_data_structures/lib.rs | 4 --- src/librustc_errors/Cargo.toml | 3 +-- src/librustc_fs_util/Cargo.toml | 1 - src/librustc_incremental/Cargo.toml | 2 +- src/librustc_interface/Cargo.toml | 2 +- src/librustc_lint/Cargo.toml | 1 - src/librustc_llvm/lib.rs | 4 --- src/librustc_metadata/Cargo.toml | 2 +- src/librustc_mir/Cargo.toml | 2 +- src/librustc_passes/Cargo.toml | 1 - src/librustc_plugin/Cargo.toml | 2 +- src/librustc_privacy/Cargo.toml | 1 - src/librustc_resolve/Cargo.toml | 2 +- src/librustc_save_analysis/Cargo.toml | 1 - src/librustc_target/Cargo.toml | 2 -- src/librustc_target/lib.rs | 4 --- src/librustc_traits/Cargo.toml | 1 - src/librustc_typeck/Cargo.toml | 2 +- src/libserialize/Cargo.toml | 1 - src/libsyntax/Cargo.toml | 2 +- src/libsyntax_ext/Cargo.toml | 2 +- src/libsyntax_pos/Cargo.toml | 2 +- src/test/run-make-fulldeps/issue-19371/foo.rs | 2 ++ 38 files changed, 18 insertions(+), 84 deletions(-) delete mode 100644 src/librustc_cratesio_shim/Cargo.toml delete mode 100644 src/librustc_cratesio_shim/src/lib.rs diff --git a/src/libarena/Cargo.toml b/src/libarena/Cargo.toml index aa1bf38b99597..2643912f6d7d6 100644 --- a/src/libarena/Cargo.toml +++ b/src/libarena/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "arena" path = "lib.rs" -crate-type = ["dylib"] [dependencies] rustc_data_structures = { path = "../librustc_data_structures" } diff --git a/src/libfmt_macros/Cargo.toml b/src/libfmt_macros/Cargo.toml index fc32f21ec4e0a..a95193b85952f 100644 --- a/src/libfmt_macros/Cargo.toml +++ b/src/libfmt_macros/Cargo.toml @@ -11,3 +11,4 @@ crate-type = ["dylib"] [dependencies] syntax_pos = { path = "../libsyntax_pos" } + diff --git a/src/libgraphviz/Cargo.toml b/src/libgraphviz/Cargo.toml index a6a3c1a249d64..4a6e41f760319 100644 --- a/src/libgraphviz/Cargo.toml +++ 
b/src/libgraphviz/Cargo.toml @@ -7,4 +7,3 @@ edition = "2018" [lib] name = "graphviz" path = "lib.rs" -crate-type = ["dylib"] diff --git a/src/librustc/Cargo.toml b/src/librustc/Cargo.toml index 4d50e80d4cf67..7584df82ac6bc 100644 --- a/src/librustc/Cargo.toml +++ b/src/librustc/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "rustc" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] arena = { path = "../libarena" } diff --git a/src/librustc/lib.rs b/src/librustc/lib.rs index b20f7120bbfa2..8bedbefc0a681 100644 --- a/src/librustc/lib.rs +++ b/src/librustc/lib.rs @@ -46,7 +46,6 @@ #![feature(extern_types)] #![feature(nll)] #![feature(non_exhaustive)] -#![feature(proc_macro_internals)] #![feature(optin_builtin_traits)] #![feature(range_is_empty)] #![feature(rustc_diagnostic_macros)] diff --git a/src/librustc_allocator/Cargo.toml b/src/librustc_allocator/Cargo.toml index cf6c598bfb17b..a964f323c9e7d 100644 --- a/src/librustc_allocator/Cargo.toml +++ b/src/librustc_allocator/Cargo.toml @@ -6,7 +6,6 @@ edition = "2018" [lib] path = "lib.rs" -crate-type = ["dylib"] test = false [dependencies] diff --git a/src/librustc_apfloat/Cargo.toml b/src/librustc_apfloat/Cargo.toml index c7496a9547ea6..af6c2feed0072 100644 --- a/src/librustc_apfloat/Cargo.toml +++ b/src/librustc_apfloat/Cargo.toml @@ -10,5 +10,4 @@ path = "lib.rs" [dependencies] bitflags = "1.0" -rustc_cratesio_shim = { path = "../librustc_cratesio_shim" } smallvec = { version = "0.6.7", features = ["union", "may_dangle"] } diff --git a/src/librustc_apfloat/lib.rs b/src/librustc_apfloat/lib.rs index 1b0bcdd0b5b48..ceade5d278838 100644 --- a/src/librustc_apfloat/lib.rs +++ b/src/librustc_apfloat/lib.rs @@ -35,9 +35,6 @@ #![deny(rust_2018_idioms)] #![feature(nll)] -// See librustc_cratesio_shim/Cargo.toml for a comment explaining this. 
-#[allow(unused_extern_crates)] -extern crate rustc_cratesio_shim; use std::cmp::Ordering; use std::fmt; diff --git a/src/librustc_borrowck/Cargo.toml b/src/librustc_borrowck/Cargo.toml index f293739dec727..e9abc17202e76 100644 --- a/src/librustc_borrowck/Cargo.toml +++ b/src/librustc_borrowck/Cargo.toml @@ -7,8 +7,8 @@ edition = "2018" [lib] name = "rustc_borrowck" path = "lib.rs" -crate-type = ["dylib"] test = false +doctest = false [dependencies] log = "0.4" diff --git a/src/librustc_codegen_llvm/lib.rs b/src/librustc_codegen_llvm/lib.rs index dbcb20315520b..0f0b9f279175c 100644 --- a/src/librustc_codegen_llvm/lib.rs +++ b/src/librustc_codegen_llvm/lib.rs @@ -39,6 +39,7 @@ extern crate rustc_incremental; extern crate rustc_codegen_utils; extern crate rustc_codegen_ssa; extern crate rustc_fs_util; +extern crate rustc_driver as _; #[macro_use] extern crate log; #[macro_use] extern crate syntax; diff --git a/src/librustc_codegen_ssa/Cargo.toml b/src/librustc_codegen_ssa/Cargo.toml index a4cb517fafed6..343596feed25f 100644 --- a/src/librustc_codegen_ssa/Cargo.toml +++ b/src/librustc_codegen_ssa/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "rustc_codegen_ssa" path = "lib.rs" -crate-type = ["dylib"] test = false [dependencies] diff --git a/src/librustc_codegen_utils/Cargo.toml b/src/librustc_codegen_utils/Cargo.toml index b218d18a06ba7..d93589ea84be0 100644 --- a/src/librustc_codegen_utils/Cargo.toml +++ b/src/librustc_codegen_utils/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "rustc_codegen_utils" path = "lib.rs" -crate-type = ["dylib"] test = false [dependencies] diff --git a/src/librustc_cratesio_shim/Cargo.toml b/src/librustc_cratesio_shim/Cargo.toml deleted file mode 100644 index 6bdfbe09354b4..0000000000000 --- a/src/librustc_cratesio_shim/Cargo.toml +++ /dev/null @@ -1,26 +0,0 @@ -# This crate exists to allow rustc to link certain crates from crates.io into -# the distribution. This doesn't work normally because: -# -# - Cargo always builds dependencies as rlibs: -# https://github.com/rust-lang/cargo/issues/629 -# - rustc wants to avoid multiple definitions of the same symbol, so it refuses -# to link multiple dylibs containing the same rlib -# - multiple dylibs depend on the same crates.io crates -# -# This solution works by including all the conflicting rlibs in a single dylib, -# which is then linked into all dylibs that depend on these crates.io crates. -# The result is that each rlib only appears once, and things work! - -[package] -authors = ["The Rust Project Developers"] -name = "rustc_cratesio_shim" -version = "0.0.0" -edition = "2018" - -[lib] -crate-type = ["dylib"] - -[dependencies] -bitflags = "1.0" -log = "0.4" -unicode-width = "0.1.4" diff --git a/src/librustc_cratesio_shim/src/lib.rs b/src/librustc_cratesio_shim/src/lib.rs deleted file mode 100644 index 4c170f4f5f6f9..0000000000000 --- a/src/librustc_cratesio_shim/src/lib.rs +++ /dev/null @@ -1,11 +0,0 @@ -#![deny(rust_2018_idioms)] - -// See Cargo.toml for a comment explaining this crate. 
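The linkage trick this shim implemented (bundling shared crates.io rlibs into
a single dylib) is superseded by plain linkage-only dependencies, as in the
librustc_codegen_llvm hunk above. A minimal sketch of that idiom, with `core`
standing in for a real dylib dependency such as `rustc_driver`:

    // Pull a crate into the link without importing any of its names.
    #[allow(unused_extern_crates)]
    extern crate core as _;

    fn main() {}
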
-#![allow(unused_extern_crates)] - -#![feature(nll)] - -extern crate bitflags; -extern crate log; -extern crate proc_macro; -extern crate unicode_width; diff --git a/src/librustc_data_structures/Cargo.toml b/src/librustc_data_structures/Cargo.toml index cd792d31187bd..acddb3448ca60 100644 --- a/src/librustc_data_structures/Cargo.toml +++ b/src/librustc_data_structures/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "rustc_data_structures" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] ena = "0.13" @@ -15,7 +15,6 @@ indexmap = "1" log = "0.4" jobserver_crate = { version = "0.1.13", package = "jobserver" } lazy_static = "1" -rustc_cratesio_shim = { path = "../librustc_cratesio_shim" } serialize = { path = "../libserialize" } graphviz = { path = "../libgraphviz" } cfg-if = "0.1.2" diff --git a/src/librustc_data_structures/lib.rs b/src/librustc_data_structures/lib.rs index 38dfb675237b5..b479643a5e8cd 100644 --- a/src/librustc_data_structures/lib.rs +++ b/src/librustc_data_structures/lib.rs @@ -38,10 +38,6 @@ extern crate libc; #[macro_use] extern crate cfg_if; -// See librustc_cratesio_shim/Cargo.toml for a comment explaining this. -#[allow(unused_extern_crates)] -extern crate rustc_cratesio_shim; - pub use rustc_serialize::hex::ToHex; #[inline(never)] diff --git a/src/librustc_errors/Cargo.toml b/src/librustc_errors/Cargo.toml index 3689a463a5c84..4df9632cce26b 100644 --- a/src/librustc_errors/Cargo.toml +++ b/src/librustc_errors/Cargo.toml @@ -7,14 +7,13 @@ edition = "2018" [lib] name = "rustc_errors" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] log = "0.4" serialize = { path = "../libserialize" } syntax_pos = { path = "../libsyntax_pos" } rustc_data_structures = { path = "../librustc_data_structures" } -rustc_cratesio_shim = { path = "../librustc_cratesio_shim" } unicode-width = "0.1.4" atty = "0.2" termcolor = "1.0" diff --git a/src/librustc_fs_util/Cargo.toml b/src/librustc_fs_util/Cargo.toml index 47918643f31fe..e74e3809927a0 100644 --- a/src/librustc_fs_util/Cargo.toml +++ b/src/librustc_fs_util/Cargo.toml @@ -7,6 +7,5 @@ edition = "2018" [lib] name = "rustc_fs_util" path = "lib.rs" -crate-type = ["dylib"] [dependencies] diff --git a/src/librustc_incremental/Cargo.toml b/src/librustc_incremental/Cargo.toml index df971ec5bdb85..9678cb4f65545 100644 --- a/src/librustc_incremental/Cargo.toml +++ b/src/librustc_incremental/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "rustc_incremental" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] graphviz = { path = "../libgraphviz" } diff --git a/src/librustc_interface/Cargo.toml b/src/librustc_interface/Cargo.toml index bcaa4216109aa..82880d2198712 100644 --- a/src/librustc_interface/Cargo.toml +++ b/src/librustc_interface/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "rustc_interface" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] log = "0.4" diff --git a/src/librustc_lint/Cargo.toml b/src/librustc_lint/Cargo.toml index fd2b635faefb4..041d0aaead913 100644 --- a/src/librustc_lint/Cargo.toml +++ b/src/librustc_lint/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "rustc_lint" path = "lib.rs" -crate-type = ["dylib"] [dependencies] log = "0.4" diff --git a/src/librustc_llvm/lib.rs b/src/librustc_llvm/lib.rs index 292ce8b0a01b0..bdf6b09185735 100644 --- a/src/librustc_llvm/lib.rs +++ b/src/librustc_llvm/lib.rs @@ -4,10 +4,6 @@ #![doc(html_root_url = "https://doc.rust-lang.org/nightly/")] -// See 
librustc_cratesio_shim/Cargo.toml for a comment explaining this. -#[allow(unused_extern_crates)] -extern crate rustc_cratesio_shim; - // NOTE: This crate only exists to allow linking on mingw targets. /// Initialize targets enabled by the build script via `cfg(llvm_component = "...")`. diff --git a/src/librustc_metadata/Cargo.toml b/src/librustc_metadata/Cargo.toml index 76aba33b6a404..e5c9f1bf2057b 100644 --- a/src/librustc_metadata/Cargo.toml +++ b/src/librustc_metadata/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "rustc_metadata" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] flate2 = "1.0" diff --git a/src/librustc_mir/Cargo.toml b/src/librustc_mir/Cargo.toml index 5de5f5e757119..695bf1f077cd2 100644 --- a/src/librustc_mir/Cargo.toml +++ b/src/librustc_mir/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "rustc_mir" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] arena = { path = "../libarena" } diff --git a/src/librustc_passes/Cargo.toml b/src/librustc_passes/Cargo.toml index 00bdcdc0cc021..de2476775b07e 100644 --- a/src/librustc_passes/Cargo.toml +++ b/src/librustc_passes/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "rustc_passes" path = "lib.rs" -crate-type = ["dylib"] [dependencies] log = "0.4" diff --git a/src/librustc_plugin/Cargo.toml b/src/librustc_plugin/Cargo.toml index 5e23aa0d7f74e..7486281c1eac1 100644 --- a/src/librustc_plugin/Cargo.toml +++ b/src/librustc_plugin/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" [lib] name = "rustc_plugin" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] rustc = { path = "../librustc" } diff --git a/src/librustc_privacy/Cargo.toml b/src/librustc_privacy/Cargo.toml index 5bf8024c56911..7cf3a5d6dcde1 100644 --- a/src/librustc_privacy/Cargo.toml +++ b/src/librustc_privacy/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "rustc_privacy" path = "lib.rs" -crate-type = ["dylib"] [dependencies] rustc = { path = "../librustc" } diff --git a/src/librustc_resolve/Cargo.toml b/src/librustc_resolve/Cargo.toml index 8e3359c775288..548f982fe3bf0 100644 --- a/src/librustc_resolve/Cargo.toml +++ b/src/librustc_resolve/Cargo.toml @@ -7,8 +7,8 @@ edition = "2018" [lib] name = "rustc_resolve" path = "lib.rs" -crate-type = ["dylib"] test = false +doctest = false [dependencies] bitflags = "1.0" diff --git a/src/librustc_save_analysis/Cargo.toml b/src/librustc_save_analysis/Cargo.toml index 767c726b761f2..88bb76d2aba3a 100644 --- a/src/librustc_save_analysis/Cargo.toml +++ b/src/librustc_save_analysis/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "rustc_save_analysis" path = "lib.rs" -crate-type = ["dylib"] [dependencies] log = "0.4" diff --git a/src/librustc_target/Cargo.toml b/src/librustc_target/Cargo.toml index 3ab25146331c1..f1b21365e4bd4 100644 --- a/src/librustc_target/Cargo.toml +++ b/src/librustc_target/Cargo.toml @@ -7,12 +7,10 @@ edition = "2018" [lib] name = "rustc_target" path = "lib.rs" -crate-type = ["dylib"] [dependencies] bitflags = "1.0" log = "0.4" -rustc_cratesio_shim = { path = "../librustc_cratesio_shim" } rustc_data_structures = { path = "../librustc_data_structures" } serialize = { path = "../libserialize" } syntax_pos = { path = "../libsyntax_pos" } diff --git a/src/librustc_target/lib.rs b/src/librustc_target/lib.rs index b65813fd8e38d..c1ec4e59ef239 100644 --- a/src/librustc_target/lib.rs +++ b/src/librustc_target/lib.rs @@ -23,10 +23,6 @@ #[allow(unused_extern_crates)] extern crate serialize as rustc_serialize; // used 
by deriving -// See librustc_cratesio_shim/Cargo.toml for a comment explaining this. -#[allow(unused_extern_crates)] -extern crate rustc_cratesio_shim; - #[macro_use] extern crate rustc_data_structures; diff --git a/src/librustc_traits/Cargo.toml b/src/librustc_traits/Cargo.toml index da19cc95eb95a..bb28ac839a544 100644 --- a/src/librustc_traits/Cargo.toml +++ b/src/librustc_traits/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "rustc_traits" path = "lib.rs" -crate-type = ["dylib"] [dependencies] bitflags = "1.0" diff --git a/src/librustc_typeck/Cargo.toml b/src/librustc_typeck/Cargo.toml index dcfcd74257e6f..ac3966676838a 100644 --- a/src/librustc_typeck/Cargo.toml +++ b/src/librustc_typeck/Cargo.toml @@ -7,8 +7,8 @@ edition = "2018" [lib] name = "rustc_typeck" path = "lib.rs" -crate-type = ["dylib"] test = false +doctest = false [dependencies] arena = { path = "../libarena" } diff --git a/src/libserialize/Cargo.toml b/src/libserialize/Cargo.toml index fa31a68a75b72..c302bcf95dcad 100644 --- a/src/libserialize/Cargo.toml +++ b/src/libserialize/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [lib] name = "serialize" path = "lib.rs" -crate-type = ["dylib", "rlib"] [dependencies] indexmap = "1" diff --git a/src/libsyntax/Cargo.toml b/src/libsyntax/Cargo.toml index b48f3c9b8b8d8..c5daa6564767e 100644 --- a/src/libsyntax/Cargo.toml +++ b/src/libsyntax/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "syntax" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] bitflags = "1.0" diff --git a/src/libsyntax_ext/Cargo.toml b/src/libsyntax_ext/Cargo.toml index 773f0948a8a10..eafbe6371a3c5 100644 --- a/src/libsyntax_ext/Cargo.toml +++ b/src/libsyntax_ext/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "syntax_ext" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] fmt_macros = { path = "../libfmt_macros" } diff --git a/src/libsyntax_pos/Cargo.toml b/src/libsyntax_pos/Cargo.toml index af7edc0a6bd3e..eebd25d1fafd8 100644 --- a/src/libsyntax_pos/Cargo.toml +++ b/src/libsyntax_pos/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [lib] name = "syntax_pos" path = "lib.rs" -crate-type = ["dylib"] +doctest = false [dependencies] serialize = { path = "../libserialize" } diff --git a/src/test/run-make-fulldeps/issue-19371/foo.rs b/src/test/run-make-fulldeps/issue-19371/foo.rs index 0cbdf40e2f908..3c4f2cd541f4e 100644 --- a/src/test/run-make-fulldeps/issue-19371/foo.rs +++ b/src/test/run-make-fulldeps/issue-19371/foo.rs @@ -2,6 +2,8 @@ extern crate rustc; extern crate rustc_interface; +#[allow(unused_extern_crates)] +extern crate rustc_driver; extern crate syntax; use rustc::session::DiagnosticOutput; From 0c3d5be015e39fb7378f436460ff9681c95ae8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20K=C3=A5re=20Alsaker?= Date: Tue, 9 Apr 2019 10:03:02 +0200 Subject: [PATCH 14/15] Remove some dummy dependencies --- src/librustc/Cargo.toml | 30 ---------------------------- src/librustc/lib.rs | 6 ++---- src/librustc_codegen_llvm/Cargo.toml | 1 + 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/src/librustc/Cargo.toml b/src/librustc/Cargo.toml index 7584df82ac6bc..b6f3bc62dfd29 100644 --- a/src/librustc/Cargo.toml +++ b/src/librustc/Cargo.toml @@ -37,33 +37,3 @@ chalk-engine = { version = "0.9.0", default-features=false } rustc_fs_util = { path = "../librustc_fs_util" } smallvec = { version = "0.6.7", features = ["union", "may_dangle"] } measureme = "0.3" - -# Note that these dependencies are a lie, they're just here to get linkage to -# work. 
-# -# We're creating a bunch of dylibs for the compiler but we're also compiling a -# bunch of crates.io crates. Everything in the compiler is compiled as an -# rlib/dylib pair but all crates.io crates tend to just be rlibs. This means -# we've got a problem for dependency graphs that look like: -# -# foo - rustc_codegen_llvm -# / \ -# rustc ---- rustc_driver -# \ / -# foo - rustc_metadata -# -# Here the crate `foo` is linked into the `rustc_codegen_llvm` and the -# `rustc_metadata` dylibs, meaning we've got duplicate copies! When we then -# go to link `rustc_driver` the compiler notices this and gives us a compiler -# error. -# -# To work around this problem we just add these crates.io dependencies to the -# `rustc` crate which is a shared dependency above. That way the crate `foo` -# shows up in the dylib for the `rustc` crate, deduplicating it and allowing -# crates like `rustc_codegen_llvm` to use `foo` *through* the `rustc` crate. -# -# tl;dr; this is not needed to get `rustc` to compile, but if you remove it then -# later crate stop compiling. If you can remove this and everything -# compiles, then please feel free to do so! -flate2 = "1.0" -tempfile = "3.0" diff --git a/src/librustc/lib.rs b/src/librustc/lib.rs index 8bedbefc0a681..dc26140ace5a5 100644 --- a/src/librustc/lib.rs +++ b/src/librustc/lib.rs @@ -89,10 +89,8 @@ extern crate serialize as rustc_serialize; #[macro_use] extern crate smallvec; -// Note that librustc doesn't actually depend on these crates, see the note in -// `Cargo.toml` for this crate about why these are here. -#[allow(unused_extern_crates)] -extern crate flate2; +// Use the test crate here so we depend on getopts through it. This allow tools to link to both +// librustc_driver and libtest. #[allow(unused_extern_crates)] extern crate test; diff --git a/src/librustc_codegen_llvm/Cargo.toml b/src/librustc_codegen_llvm/Cargo.toml index 4ae8303c76d3c..291d32a06814d 100644 --- a/src/librustc_codegen_llvm/Cargo.toml +++ b/src/librustc_codegen_llvm/Cargo.toml @@ -13,6 +13,7 @@ test = false [dependencies] cc = "1.0.1" # Used to locate MSVC num_cpus = "1.0" +tempfile = "3.0" rustc-demangle = "0.1.15" rustc_llvm = { path = "../librustc_llvm" } memmap = "0.6" From b1efd0bc0b7e351a06faa0e4e0cbfefabc946b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20K=C3=A5re=20Alsaker?= Date: Fri, 5 Jul 2019 12:23:51 +0200 Subject: [PATCH 15/15] Update Cargo.lock --- Cargo.lock | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d962b134ea289..8fe7f43afeadd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2469,7 +2469,6 @@ dependencies = [ "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)", "chalk-engine 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", - "flate2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", "fmt_macros 0.0.0", "graphviz 0.0.0", "jobserver 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", @@ -2492,7 +2491,6 @@ dependencies = [ "smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", "syntax 0.0.0", "syntax_pos 0.0.0", - "tempfile 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -2724,7 +2722,6 @@ name = "rustc_apfloat" version = "0.0.0" dependencies = [ "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc_cratesio_shim 0.0.0", "smallvec 0.6.10 
(registry+https://github.com/rust-lang/crates.io-index)", ] @@ -2762,6 +2759,7 @@ dependencies = [ "num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-demangle 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)", "rustc_llvm 0.0.0", + "tempfile 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -2808,15 +2806,6 @@ dependencies = [ "syntax_pos 0.0.0", ] -[[package]] -name = "rustc_cratesio_shim" -version = "0.0.0" -dependencies = [ - "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "rustc_data_structures" version = "0.0.0" @@ -2832,7 +2821,6 @@ dependencies = [ "rustc-hash 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-rayon 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-rayon-core 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc_cratesio_shim 0.0.0", "serialize 0.0.0", "smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", "stable_deref_trait 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -2881,7 +2869,6 @@ dependencies = [ "annotate-snippets 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc_cratesio_shim 0.0.0", "rustc_data_structures 0.0.0", "serialize 0.0.0", "syntax_pos 0.0.0", @@ -3111,7 +3098,6 @@ version = "0.0.0" dependencies = [ "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc_cratesio_shim 0.0.0", "rustc_data_structures 0.0.0", "serialize 0.0.0", "syntax_pos 0.0.0",