[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
bigscience-workshop · Jul 10, 2023 · 60fe9f1
1 parent c0f9ac7
commit 60fe9f1
Show file tree

Hide file tree

Showing 26 changed files with 31 additions and 44 deletions.
diff --git a/ac_dc/anonymization.py b/ac_dc/anonymization.py
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
  tag_type=tag_type,
  )
  if anonymize_condition:
- for (ent, start, end, tag) in ner:
+ for ent, start, end, tag in ner:
  # we need to actually walk through and replace by start, end span.
  sentence = sentence.replace(ent, f" <{tag}> ")
  return sentence, ner
diff --git a/ac_dc/deduplicate/self_deduplicate.py b/ac_dc/deduplicate/self_deduplicate.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # @Date : 2022-01-08 22:39:29
 # @Author : Chenghao Mou (mouchenghao@gmail.com)
 # @Description: Self-deduplication with `datasets`
@@ -27,8 +26,7 @@
 
 
 def main(conf: str) -> None:
-
- with open(conf, "r") as f:
+ with open(conf) as f:
  conf = yaml.safe_load(f.read())
 
  if conf["load_from_disk"]["path"]:
@@ -201,5 +199,4 @@ def main(conf: str) -> None:
 
 
 if __name__ == "__main__":
-
  typer.run(main)
diff --git a/ac_dc/visualization/get_data_for_visualization.py b/ac_dc/visualization/get_data_for_visualization.py
@@ -21,7 +21,6 @@ def __init__(
  path_kenlm_model,
  path_save_stats,
  ):
-
  self.ds = dataset
  self.num_iter = num_iter
 
@@ -166,7 +165,6 @@ def compute_stats(self):
 
 
 if __name__ == "__main__":
-
  lang_dataset_id = "en"
 
  dataset_name = "oscar" # "TurkuNLP/register_oscar"

diff --git a/ac_dc/visualization/visualization.py b/ac_dc/visualization/visualization.py
@@ -625,7 +625,6 @@ def filtering_of_words(self):
  )
 
  if display_discarded_words_by_filter:
-
  if "len_word" in columns:
  cond_filter = np.invert(conds_words["len_word"])
  Visualization_for_lang.display_dataset(
@@ -698,7 +697,6 @@ def is_doc_discarded(key, score):
  return score < key[1]
 
  if personal_doc:
-
  st.markdown("Statistics of the document:")
 
  for key in self.keys:

diff --git a/bertin/evaluation/run_glue.py b/bertin/evaluation/run_glue.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/bertin/evaluation/run_ner.py b/bertin/evaluation/run_ner.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/bertin/mc4/mc4.py b/bertin/mc4/mc4.py
@@ -404,7 +404,7 @@ def _generate_examples(self, filepaths):
  for filepath in filepaths:
  logger.info("generating examples from = %s", filepath)
  if filepath.endswith("jsonl"):
- with open(filepath, "r", encoding="utf-8") as f:
+ with open(filepath, encoding="utf-8") as f:
  for line in f:
  if line:
  example = json.loads(line)

diff --git a/bertin/run_mlm_flax.py b/bertin/run_mlm_flax.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/bertin/run_mlm_flax_stream.py b/bertin/run_mlm_flax_stream.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -446,7 +445,7 @@ def restore_checkpoint(save_dir, state):
  args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
  data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))
 
- with open(os.path.join(save_dir, "training_state.json"), "r") as f:
+ with open(os.path.join(save_dir, "training_state.json")) as f:
  training_state = json.load(f)
  step = training_state["step"]
 

diff --git a/bertin/utils/dataset_perplexity.py b/bertin/utils/dataset_perplexity.py
@@ -17,7 +17,7 @@ def get_perplexity(doc):
 
 
 with open("mc4-es-train-50M-stats.csv", "w") as csv:
- with open("mc4-es-train-50M-steps.jsonl", "r") as data:
+ with open("mc4-es-train-50M-steps.jsonl") as data:
  for line in tqdm(data):
  text = json.loads(line)["text"]
  csv.write(f"{len(text.split())},{get_perplexity(text)}\n")
diff --git a/cc_pseudo_crawl/python_scripts/download_warc.py b/cc_pseudo_crawl/python_scripts/download_warc.py
@@ -143,9 +143,9 @@ def get_warcs(batch):
  existing_compressed_warcs,
  )
 
- batch["compressed_warc"], batch["download_exception"] = [
+ batch["compressed_warc"], batch["download_exception"] = (
  list(l) for l in zip(*warcs_or_exceptions)
- ]
+ )
  return batch
 
 

diff --git a/cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py b/cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py
@@ -431,7 +431,6 @@ def main(args: PreprocessingConfig) -> None: # Setup logging
  ]
 
  def process_file(file_name: str):
-
  logger.info(config.HF_DATASETS_CACHE)
  processing_name = (
  "-".join(args.metadata_to_include)

diff --git a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
@@ -21,7 +21,7 @@ def main():
 
  seed_ids = []
  for seed_path in args.seed_paths:
- with open(seed_path, "r") as fi:
+ with open(seed_path) as fi:
  data = csv.reader(fi)
  # First line is all the headers that we remove.
  seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]

diff --git a/kenlm_training/cc_net/execution.py b/kenlm_training/cc_net/execution.py
@@ -42,7 +42,6 @@ def get_executor(
  task_parallelism: int = -1,
  options: dict = {},
 ) -> Executor:
-
  execution_mode = execution.split(",")[0]
  options.update(
  {kv.split("=", 1)[0]: kv.split("=", 1)[1] for kv in execution.split(",")[1:]}

diff --git a/kenlm_training/cc_net/jsonql.py b/kenlm_training/cc_net/jsonql.py
@@ -880,8 +880,7 @@ def describe(source, columns=None, weights=None, **kwargs):
  continue
  if "." in k or k == ALL_DOCUMENTS:
  continue
- for line in display_stats(stats, k, weights=weights, **kwargs):
- yield line
+ yield from display_stats(stats, k, weights=weights, **kwargs)
 
 
 def shard(lines):
@@ -961,7 +960,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
  if filename.suffix == ".gz":
  file: TextIO = gzip.open(filename, "rt") # type: ignore
  else:
- file = open(filename, "rt")
+ file = open(filename)
 
  return _close_when_exhausted(file)
 
@@ -1015,7 +1014,7 @@ def open_write(
  if filename.suffix == ".gz":
  return BlockedGzipWriter(Path(filename), mode, block_size="64M")
 
- return open(filename, "wt")
+ return open(filename, "w")
 
 
 def parse_size(size):

diff --git a/kenlm_training/tests/test_jsonql.py b/kenlm_training/tests/test_jsonql.py
@@ -262,7 +262,7 @@ def do(self, x):
  def acc(values):
  print("acc: started")
  res = 0
- for (x, _) in values:
+ for x, _ in values:
  res += int(x)
  print("acc: done")
  yield f"acc: result={res}"

diff --git a/pii-manager/setup.py b/pii-manager/setup.py
@@ -27,15 +27,15 @@
 
 def requirements(filename="requirements.txt"):
  """Read the requirements file"""
- with io.open(filename, "r") as f:
+ with open(filename) as f:
  return [line.strip() for line in f if line and line[0] != "#"]
 
 
 def long_description():
  """
  Take the README and remove markdown hyperlinks
  """
- with open("README.md", "rt", encoding="utf-8") as f:
+ with open("README.md", encoding="utf-8") as f:
  desc = f.read()
  desc = re.sub(r"^\[ ([^\]]+) \]: \s+ \S.*\n", r"", desc, flags=re.X | re.M)
  return re.sub(r"\[ ([^\]]+) \]", r"\1", desc, flags=re.X)

diff --git a/pii-manager/src/pii_manager/api/manager.py b/pii-manager/src/pii_manager/api/manager.py
@@ -31,22 +31,19 @@ def fetch_all_tasks(
  """
  taskdict = get_taskdict(debug=debug)
  # Language-independent
- for task in taskdict[LANG_ANY].values():
- yield task
+ yield from taskdict[LANG_ANY].values()
 
  langdict = taskdict.get(lang, {})
  # Country-independent
- for task in langdict.get(COUNTRY_ANY, {}).values():
- yield task
+ yield from langdict.get(COUNTRY_ANY, {}).values()
  # Country-specific
  if country:
  if country[0] in (COUNTRY_ANY, "all"):
  country = country_list(lang)
  for c in country:
  if c == COUNTRY_ANY: # already included above
  continue
- for task in langdict.get(c, {}).values():
- yield task
+ yield from langdict.get(c, {}).values()
 
 
 def fetch_task(

diff --git a/pii-manager/test/unit/api/test_file.py b/pii-manager/test/unit/api/test_file.py
@@ -12,7 +12,7 @@ def datafile(name: str) -> str:
 
 
 def readfile(name: str) -> str:
- with open(name, "rt", encoding="utf-8") as f:
+ with open(name, encoding="utf-8") as f:
  return f.read().strip()
 
 

diff --git a/pii-manager/test/unit/api/test_file_taskfile.py b/pii-manager/test/unit/api/test_file_taskfile.py
@@ -14,7 +14,7 @@ def datafile(name: str) -> str:
 
 
 def readfile(name: str) -> str:
- with open(name, "rt", encoding="utf-8") as f:
+ with open(name, encoding="utf-8") as f:
  return f.read().strip()
 
 

diff --git a/pii-manager/test/unit/api/test_manager.py b/pii-manager/test/unit/api/test_manager.py
@@ -21,7 +21,10 @@ def test20_info():
  info = obj.task_info()
 
  exp = {
- (PiiEnum.CREDIT_CARD, None,): [
+ (
+ PiiEnum.CREDIT_CARD,
+ None,
+ ): [
  (
  "credit card",
  "Credit card numbers for most international credit cards (detect & validate)",

diff --git a/pii-manager/test/unit/api/test_manager_add.py b/pii-manager/test/unit/api/test_manager_add.py
@@ -47,7 +47,7 @@ def test110_call():
  obj = PiiManager("en", None, PiiEnum.EMAIL_ADDRESS)
  obj.add_tasks([DUMMY_REGEX])
 
- for (doc, exp) in TEST_REGEX:
+ for doc, exp in TEST_REGEX:
  got = obj(doc)
  assert got == exp
 
@@ -86,6 +86,6 @@ def test200_call():
  obj = PiiManager("en")
  obj.add_tasks([DUMMY_CLASS])
 
- for (doc, exp) in TEST_CLASS:
+ for doc, exp in TEST_CLASS:
  got = obj(doc)
  assert got == exp
diff --git a/pii-manager/test/unit/api/test_manager_ctx.py b/pii-manager/test/unit/api/test_manager_ctx.py
@@ -38,7 +38,7 @@ def test10_context_regex():
  """
  obj = PiiManager("en", mode="extract")
  obj.add_tasks([DUMMY_REGEX])
- for (text, exp) in TEST:
+ for text, exp in TEST:
  got = obj(text)
  assert list(got) == exp
 
@@ -64,6 +64,6 @@ def test20_context_class():
  """
  obj = PiiManager("en", mode="extract")
  obj.add_tasks([DUMMY_CLASS])
- for (text, exp) in TEST:
+ for text, exp in TEST:
  got = obj(text)
  assert list(got) == exp
diff --git a/pii-manager/test/unit/helper/test_context.py b/pii-manager/test/unit/helper/test_context.py
@@ -74,7 +74,7 @@ def test10_context_true():
  """
  Check valid contexts
  """
- for (text, context) in TEST_TRUE:
+ for text, context in TEST_TRUE:
  spec = mod.context_spec(context)
  assert mod.context_check(text, spec, 20) is True
 
@@ -83,7 +83,7 @@ def test20_context_false():
  """
  Check invalid contexts
  """
- for (text, context) in TEST_FALSE:
+ for text, context in TEST_FALSE:
  spec = mod.context_spec(context)
  assert mod.context_check(text, spec, 20) is False
 

diff --git a/pii-manager/test/unit/helper/test_norm.py b/pii-manager/test/unit/helper/test_norm.py
@@ -8,5 +8,5 @@ def test10_normalizer():
  """
  Create base object
  """
- for (text, exp) in TEST:
+ for text, exp in TEST:
  assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp
diff --git a/tokenizer/python_script/dedup_lines.py b/tokenizer/python_script/dedup_lines.py
@@ -28,6 +28,7 @@
 
 META_COLUMNS = ["meta"]
 
+
 # filter text to remove certain lines (e.g. menu items, copyright notice)
 def filter_lines(article, skip_set, used_lines):
  # TODO discuss the strip