Merge pull request IBM#878 from IBM/ededup-simplify
Refactored ededup with own dpk_ededup namespace
touma-I authored Dec 17, 2024
2 parents 03b7e09 + b6c7b44 commit d4ffb13
Showing 66 changed files with 751 additions and 870 deletions.
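The central change: the exact-dedup transform now lives in its own dpk_ededup package instead of the per-runtime universal/ededup/python and universal/ededup/ray layout, so callers import from dpk_ededup directly. Below is a minimal sketch of the new-style invocation, assembled from the notebook diff in this commit; the folder names and the contents/document_id column names are the notebook's example values, not fixed requirements of the API.

    from dpk_ededup.transform_python import Ededup

    # Exact dedup: hash each document's "contents" column and keep only the
    # first occurrence. Folders and column names mirror the notebook example
    # further down in this diff; adjust them to your own data layout.
    Ededup(input_folder="doc-chunk-files",
           output_folder="dedup-files",
           ededup_doc_column="contents",
           ededup_doc_id_column="document_id").transform()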
2 changes: 1 addition & 1 deletion scripts/k8s-setup/populate_minio.sh
@@ -38,7 +38,7 @@ mc cp --recursive ${REPOROOT}/transforms/language/doc_chunk/test-data/input/ kfp
 mc cp --recursive ${REPOROOT}/transforms/language/html2parquet/test-data/input/test1.html kfp/test/html2parquet/input
 # universal
 mc cp --recursive ${REPOROOT}/transforms/universal/doc_id/test-data/input/ kfp/test/doc_id/input
-mc cp --recursive ${REPOROOT}/transforms/universal/ededup/ray/test-data/input/ kfp/test/ededup/input
+mc cp --recursive ${REPOROOT}/transforms/universal/ededup/test-data/input/ kfp/test/ededup/input
 mc cp --recursive ${REPOROOT}/transforms/universal/fdedup/ray/test-data/input/ kfp/test/fdedup/input
 mc cp --recursive ${REPOROOT}/transforms/universal/filter/ray/test-data/input/ kfp/test/filter/input
 mc cp --recursive ${REPOROOT}/transforms/universal/noop/ray/test-data/input/ kfp/test/noop/input
2 changes: 1 addition & 1 deletion transforms/code/code_profiler/python/requirements.txt
@@ -52,7 +52,7 @@ packaging==24.0
 pandas==2.2.2
 parso==0.8.4
 pexpect==4.9.0
-pillow==10.3.0
+pillow>=10.3.0
 platformdirs==4.2.2
 prompt_toolkit==3.0.45
 protobuf==5.27.2
2 changes: 1 addition & 1 deletion transforms/code/code_quality/python/requirements.txt
@@ -1,3 +1,3 @@
 data-prep-toolkit>=0.2.3
 bs4==0.0.2
-transformers==4.38.2
+transformers>=4.38.2
2 changes: 1 addition & 1 deletion transforms/language/lang_id/requirements.txt
@@ -1,4 +1,4 @@
 fasttext==0.9.2
-langcodes==3.3.0
+langcodes>=3.3.0
 huggingface-hub >= 0.21.4, <1.0.0
 numpy==1.26.4
2 changes: 1 addition & 1 deletion transforms/language/text_encoder/requirements.txt
@@ -1 +1 @@
-sentence-transformers==3.0.1
+sentence-transformers>=3.0.1
6 changes: 4 additions & 2 deletions transforms/pyproject.toml
@@ -62,7 +62,7 @@ language = { file = [

 "universal/hap/python/requirements.txt",
 "universal/tokenization/python/requirements.txt",
-"universal/ededup/python/requirements.txt",
+"universal/ededup/requirements.txt",
 "universal/fdedup/python/requirements.txt",

 "language/doc_quality/requirements.txt",
@@ -93,7 +93,6 @@ code_profiler = { file = ["code/code_profiler/python/requirements.txt"]}

 pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]}

-ededup = { file = ["universal/ededup/python/requirements.txt"]}
 fdedup = { file = ["universal/fdedup/python/requirements.txt"]}
 profiler = { file = ["universal/profiler/python/requirements.txt"]}
 filter = { file = ["universal/filter/python/requirements.txt"]}
@@ -110,6 +109,8 @@ text_encoder = { file = ["language/text_encoder/requirements.txt"]}

 doc_id = { file = ["universal/doc_id/requirements.txt"]}
 hap = { file = ["universal/hap/requirements.txt"]}
+ededup = { file = ["universal/ededup/requirements.txt"]}
+
 web2parquet = { file = ["universal/web2parquet/requirements.txt"]}

 # Does not seem to work for our custom layout
@@ -128,6 +129,7 @@ dpk_pdf2parquet = "language/pdf2parquet/dpk_pdf2parquet"
 dpk_text_encoder = "language/text_encoder/dpk_text_encoder"
 dpk_doc_id = "universal/doc_id/dpk_doc_id"
 dpk_hap = "universal/hap/dpk_hap"
+dpk_ededup = "universal/ededup/dpk_ededup"

 #[tool.setuptools.package-data]
 #"*" = ["*.txt"]
209 changes: 93 additions & 116 deletions transforms/transforms-1.0-lang.ipynb
@@ -10,7 +10,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 38,
 "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
 "metadata": {},
 "outputs": [],
@@ -21,9 +21,17 @@
 "import pandas as pd"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "c276c60e",
+"metadata": {},
+"source": [
+"Configure and run web2parquet"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 39,
 "id": "b6c89ac7-6824-4d99-8120-7d5b150bd683",
 "metadata": {},
 "outputs": [],
@@ -35,7 +43,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 40,
 "id": "c2a12abc-9460-4e45-8961-873b48a9ab19",
 "metadata": {},
 "outputs": [],
@@ -50,7 +58,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 25,
 "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
 "metadata": {},
 "outputs": [],
@@ -60,43 +68,20 @@
 "#glob.glob(\"downloads/*\") "
 ]
 },
+{
+"cell_type": "markdown",
+"id": "bd71fe8a",
+"metadata": {},
+"source": [
+"Configure and run Pdf2Parquet"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": null,
 "id": "7276fe84-6512-4605-ab65-747351e13a7c",
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"10:55:10 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': <pdf2parquet_contents_types.MARKDOWN: 'text/markdown'>, 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': <pdf2parquet_ocr_engine.EASYOCR: 'easyocr'>, 'bitmap_area_threshold': 0.05, 'pdf_backend': <pdf2parquet_pdf_backend.DLPARSE_V2: 'dlparse_v2'>, 'double_precision': 8}\n",
-"10:55:10 INFO - pipeline id pipeline_id\n",
-"10:55:10 INFO - code location None\n",
-"10:55:10 INFO - data factory data_ is using local data access: input_folder - downloads output_folder - pdf2parquet-files\n",
-"10:55:10 INFO - data factory data_ max_files -1, n_sample -1\n",
-"10:55:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n",
-"10:55:10 INFO - orchestrator pdf2parquet started at 2024-12-14 10:55:10\n",
-"10:55:10 INFO - Number of files is 1, source profile {'max_file_size': 5.308699607849121, 'min_file_size': 5.308699607849121, 'total_file_size': 5.308699607849121}\n",
-"10:55:10 INFO - Initializing models\n",
-"Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 20015.24it/s]\n",
-"10:56:06 INFO - Completed 1 files (100.0%) in 0.847 min\n",
-"10:56:06 INFO - Done processing 1 files, waiting for flush() completion.\n",
-"10:56:06 INFO - done flushing in 0.0 sec\n",
-"10:56:07 INFO - Completed execution in 0.941 min, execution result 0\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"0"
-]
-},
-"execution_count": 5,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": [
 "from dpk_pdf2parquet.transform_python import Pdf2Parquet\n",
 "Pdf2Parquet(input_folder= \"downloads\", \n",
@@ -107,7 +92,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 29,
 "id": "fef6667e-71ed-4054-9382-55c6bb3fda70",
 "metadata": {},
 "outputs": [],
@@ -117,30 +102,20 @@
 "#table.to_pandas()"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "54cba5c4",
+"metadata": {},
+"source": [
+"Configure and Run DocChunk"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
 "id": "fe8bf1bc",
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"10:56:09 INFO - pipeline id pipeline_id\n",
-"10:56:09 INFO - code location None\n",
-"10:56:09 INFO - data factory data_ is using local data access: input_folder - pdf2parquet-files output_folder - doc-chunk-files\n",
-"10:56:09 INFO - data factory data_ max_files -1, n_sample -1\n",
-"10:56:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-"10:56:09 INFO - orchestrator doc_chunk started at 2024-12-14 10:56:09\n",
-"10:56:09 INFO - Number of files is 1, source profile {'max_file_size': 0.023062705993652344, 'min_file_size': 0.023062705993652344, 'total_file_size': 0.023062705993652344}\n",
-"10:56:09 INFO - Completed 1 files (100.0%) in 0.001 min\n",
-"10:56:09 INFO - Done processing 1 files, waiting for flush() completion.\n",
-"10:56:09 INFO - done flushing in 0.0 sec\n",
-"10:56:09 INFO - Completed execution in 0.001 min, execution result 0\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "%%capture\n",
 "from dpk_doc_chunk.transform_python import DocChunk\n",
@@ -151,7 +126,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 31,
 "id": "9d4f7bfc",
 "metadata": {},
 "outputs": [],
@@ -161,45 +136,57 @@
 "#table.to_pandas()"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "349cf6ff",
+"metadata": {},
+"source": [
+"Configure and Run Exact dedup"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": null,
+"id": "38480cd5",
+"metadata": {},
+"outputs": [],
+"source": [
+"from dpk_ededup.transform_python import Ededup\n",
+"Ededup(input_folder=\"doc-chunk-files\",\n",
+" output_folder=\"dedup-files\",\n",
+" ededup_doc_column=\"contents\",\n",
+" ededup_doc_id_column=\"document_id\").transform()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 33,
+"id": "27e36a8e",
+"metadata": {},
+"outputs": [],
+"source": [
+"##### **** To explore the output from eDedup, run the code below\n",
+"#table = pq.read_table('dedup-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
+"#table.to_pandas()"
+]
+},
+{
+"cell_type": "markdown",
+"id": "318bc520",
+"metadata": {},
+"source": [
+"Configure and run Lang Id"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
 "id": "ad27a462",
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"10:57:06 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'contents', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
-"10:57:06 INFO - pipeline id pipeline_id\n",
-"10:57:06 INFO - code location None\n",
-"10:57:06 INFO - data factory data_ is using local data access: input_folder - dedup-files output_folder - langId-files\n",
-"10:57:06 INFO - data factory data_ max_files -1, n_sample -1\n",
-"10:57:06 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-"10:57:06 INFO - orchestrator lang_id started at 2024-12-14 10:57:06\n",
-"10:57:06 INFO - Number of files is 1, source profile {'max_file_size': 0.031200408935546875, 'min_file_size': 0.031200408935546875, 'total_file_size': 0.031200408935546875}\n",
-"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
-"10:57:08 INFO - Completed 1 files (100.0%) in 0.001 min\n",
-"10:57:08 INFO - Done processing 1 files, waiting for flush() completion.\n",
-"10:57:08 INFO - done flushing in 0.0 sec\n",
-"10:57:08 INFO - Completed execution in 0.036 min, execution result 0\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"0"
-]
-},
-"execution_count": 12,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": [
 "from dpk_lang_id.transform_python import LangId\n",
-"LangId(input_folder= \"doc-chunk-files\",\n",
+"LangId(input_folder= \"dedup-files\",\n",
 " output_folder= \"langId-files\",\n",
 " lang_id_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n",
 " lang_id_model_kind= \"fasttext\",\n",
@@ -209,7 +196,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 13,
+"execution_count": 35,
 "id": "c35cab2e",
 "metadata": {},
 "outputs": [],
@@ -219,42 +206,32 @@
 "#table.to_pandas()"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "a968dbb4",
+"metadata": {},
+"source": [
+"Configure and run Doc Quality"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": null,
 "id": "4e84ce78",
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"10:57:13 INFO - pipeline id pipeline_id\n",
-"10:57:13 INFO - code location None\n",
-"10:57:13 INFO - data factory data_ is using local data access: input_folder - dedup-files output_folder - doc-quality-files\n",
-"10:57:13 INFO - data factory data_ max_files -1, n_sample -1\n",
-"10:57:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-"10:57:13 INFO - orchestrator docq started at 2024-12-14 10:57:13\n",
-"10:57:13 INFO - Number of files is 1, source profile {'max_file_size': 0.031200408935546875, 'min_file_size': 0.031200408935546875, 'total_file_size': 0.031200408935546875}\n",
-"10:57:13 INFO - Completed 1 files (100.0%) in 0.003 min\n",
-"10:57:13 INFO - Done processing 1 files, waiting for flush() completion.\n",
-"10:57:13 INFO - done flushing in 0.0 sec\n",
-"10:57:13 INFO - Completed execution in 0.003 min, execution result 0\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "%%capture\n",
 "from dpk_doc_quality.transform_python import DocQuality\n",
-"DocQuality(input_folder='doc-chunk-files',\n",
+"DocQuality(input_folder='dedup-files',\n",
 " output_folder= 'doc-quality-files',\n",
 " docq_text_lang = \"en\",\n",
 " docq_doc_content_column =\"contents\").transform()"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 15,
+"execution_count": 37,
 "id": "d98b854f",
 "metadata": {},
 "outputs": [],
@@ -267,7 +244,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "venv",
+"display_name": "Python 3",
 "language": "python",
 "name": "python3"
 },