From 662c2f19b450fd38746345c14f780d3cfbc5aacd Mon Sep 17 00:00:00 2001
From: loubnabnl
Date: Tue, 5 Jul 2022 10:16:19 +0000
Subject: [PATCH 1/3] ignore whitespaces for hash

---
 .../research_projects/codeparrot/scripts/preprocessing.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py
index 3d4ec40dec77dd..1c637b679ac4b0 100644
--- a/examples/research_projects/codeparrot/scripts/preprocessing.py
+++ b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -3,6 +3,7 @@
 import json
 import multiprocessing
 import os
+import re
 import shutil
 import time
 from pathlib import Path
@@ -15,9 +16,12 @@
 from transformers import AutoTokenizer, HfArgumentParser


+PATTERN = re.compile(r'\s+')
+
+
 def get_hash(example):
     """Get hash of content field."""
-    return {"hash": hashlib.md5(example["content"].strip().encode("utf-8")).hexdigest()}
+    return {"hash": hashlib.md5(re.sub(PATTERN, '', example["content"]).encode("utf-8")).hexdigest()}


 def line_stats(example):

From 7bfe478d354190f574405ef15aa5fd34af6a70e1 Mon Sep 17 00:00:00 2001
From: loubnabnl
Date: Tue, 5 Jul 2022 10:17:15 +0000
Subject: [PATCH 2/3] reformat code

---
 .../research_projects/codeparrot/scripts/preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py
index 1c637b679ac4b0..6236a8aad86aa1 100644
--- a/examples/research_projects/codeparrot/scripts/preprocessing.py
+++ b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -16,12 +16,12 @@
 from transformers import AutoTokenizer, HfArgumentParser


-PATTERN = re.compile(r'\s+')
+PATTERN = re.compile(r"\s+")


 def get_hash(example):
     """Get hash of content field."""
-    return {"hash": hashlib.md5(re.sub(PATTERN, '', example["content"]).encode("utf-8")).hexdigest()}
+    return {"hash": hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()}


 def line_stats(example):

From abf600ee6e8422c922d900876ecc43bda7669e68 Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Thu, 28 Jul 2022 14:51:48 +0200
Subject: [PATCH 3/3] Update README.md

---
 examples/research_projects/codeparrot/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md
index 4e70381a746cac..ef92606c545a78 100644
--- a/examples/research_projects/codeparrot/README.md
+++ b/examples/research_projects/codeparrot/README.md
@@ -39,7 +39,7 @@ The source of the dataset is the GitHub dump available on Google's [BigQuery](ht
 ### Preprocessing
 The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374) and some new ones:
-- exact deduplication using each file's hash
+- exact deduplication using each file's hash after removing whitespaces.
 - near deduplication using MinHash and Jaccard similarity. MinHash with a Jaccard threshold (default=0.85) is first used to create duplicate clusters. These clusters are then reduced to unique files based on the exact Jaccard similarity. See `deduplicate_dataset` in `minhash_deduplication.py` for a detailed description.
 - filtering files with max line length > 1000
 - filtering files with mean line length > 100
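
Note on what the first two patches change: before this series, `get_hash` only called `.strip()`, so two files that differed in any interior formatting still hashed differently. Hashing the content with all whitespace removed makes the exact-deduplication step insensitive to formatting-only differences. Below is a minimal standalone sketch of the resulting behavior; the `exact_dedup` helper and the toy inputs are illustrative and not part of the repository's scripts.

```python
import hashlib
import re

PATTERN = re.compile(r"\s+")


def get_hash(content: str) -> str:
    # Hash the content with all whitespace removed, so files that differ
    # only in formatting collapse to the same key.
    return hashlib.md5(PATTERN.sub("", content).encode("utf-8")).hexdigest()


def exact_dedup(files):
    # Keep the first file seen for each whitespace-insensitive hash.
    seen, unique = set(), []
    for content in files:
        h = get_hash(content)
        if h not in seen:
            seen.add(h)
            unique.append(content)
    return unique


samples = [
    "def f():\n    return 1\n",
    "def f():\n\treturn 1",  # same code, different indentation
    "def g():\n    return 2\n",
]
print(len(exact_dedup(samples)))  # 2: the first two hash identically
```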
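The near-deduplication bullet in the README refers to `minhash_deduplication.py` for the actual implementation. As a rough illustration of the MinHash/LSH clustering it describes, here is a sketch using the third-party `datasketch` library with hypothetical helper names; it is not the repository's code.

```python
from datasketch import MinHash, MinHashLSH  # pip install datasketch

NUM_PERM = 128


def signature(content: str) -> MinHash:
    # MinHash signature over the file's whitespace-split token set.
    m = MinHash(num_perm=NUM_PERM)
    for token in set(content.split()):
        m.update(token.encode("utf-8"))
    return m


def duplicate_clusters(files, threshold=0.85):
    # Index every file in an LSH structure, then query it to group files
    # whose estimated Jaccard similarity exceeds the threshold.
    lsh = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)
    sigs = {i: signature(c) for i, c in enumerate(files)}
    for i, s in sigs.items():
        lsh.insert(i, s)
    clusters, seen = [], set()
    for i, s in sigs.items():
        if i in seen:
            continue
        cluster = lsh.query(s)  # candidate near-duplicates of file i
        seen.update(cluster)
        clusters.append(cluster)
    return clusters
```

In the actual pipeline each cluster is then reduced to unique representatives by checking exact Jaccard similarity between cluster members, as the README notes.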