Skip to content

Commit

Permalink
more fixes
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <touma@us.ibm.com>
  • Loading branch information
touma-I committed Dec 18, 2024
1 parent 470152f commit b95e99e
Show file tree
Hide file tree
Showing 9 changed files with 436 additions and 169 deletions.
8 changes: 1 addition & 7 deletions transforms/universal/fdedup/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,4 @@ TRANSFORM_NAME=$(shell basename `pwd`)
################################################################################




un-cli-sample:
$(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \
RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \
--fdedup_id_column int_id_column" \
.transforms.run-src-file

36 changes: 15 additions & 21 deletions transforms/universal/fdedup/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,15 @@ make venv
Subsequently, the main orchestration program can run with:
```commandline
source venv/bin/activate
cd dpk_fdedup
python transform_python.py
python -m dpk_fdedup.transform_python
```
Alternatively the transforms included in fuzzy dedup can be launched independently:
```commandline
source venv/bin/activate
cd dpk_fdedup
python signature_calc/local_python.py
python cluster_analysis/local_python.py
python get_duplicate_list/transform_local_python.py
python data_cleaning/local_python.py
python -m dpk_fdedup.signature_calc.local_python
python -m dpk_fdedup.cluster_analysis.local_python
python -m dpk_fdedup.get_duplicate_list.transform_local_python
python -m dpk_fdedup.data_cleaning.local_python
```
After running the transforms, execute:
```shell
Expand Down Expand Up @@ -266,17 +264,15 @@ make venv
Subsequently, the main orchestration program can run with:
```commandline
source venv/bin/activate
cd dpk_fdedup
python ray/transform.py
python -m dpk_fdedup.ray.transform
```
Alternatively the transforms included in fuzzy dedup can be launched independently:
```commandline
source venv/bin/activate
cd dpk_fdedup
python signature_calc/ray/local.py
python cluster_analysis/ray/local.py
python get_duplicate_list/ray/tarnsform.py
python data_cleaning/ray/local.py
python -m dpk_fdedup.signature_calc.ray.local
python -m dpk_fdedup.cluster_analysis.ray.local
python -m dpk_fdedup.get_duplicate_list.ray.transform
python -m dpk_fdedup.data_cleaning.ray.local
```
After running the transforms, execute:
```shell
Expand Down Expand Up @@ -340,17 +336,15 @@ make venv
Subsequently, the main orchestration program can run with:
```commandline
source venv/bin/activate
cd dpk_fdedup
python spark/transform.py
python -m dpk_fdedup.spark.transform
```
Alternatively the transforms included in fuzzy dedup can be launched independently:
```commandline
source venv/bin/activate
cd dpk_fdedup
python signature_calc/spark/local.py
python cluster_analysis/spark/local.py
python get_duplicate_list/spark/transform.py
python data_cleanin/spark/local.py
python -m dpk_fdedup.signature_calc.spark.local
python -m dpk_fdedup.cluster_analysis.spark.local
python -m dpk_fdedup.get_duplicate_list.transform
python -m dpk_fdedup.data_cleaning.spark.local
```
After running the transforms, execute:
```shell
Expand Down
10 changes: 5 additions & 5 deletions transforms/universal/fdedup/dpk_fdedup/spark/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@
import os
import sys

from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration
from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration
from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration
from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing_spark.runtime.spark import SparkTransformLauncher
from fdedup_transform_python import ServiceOrchestrator, parse_args
from get_duplicate_list_transform_python import (
from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args
from dpk_fdedup.get_duplicate_list.transform_python import (
GetDuplicateListPythonTransformConfiguration,
)
from signature_calc_transform_spark import (
from dpk_fdedup.signature_calc.spark.transform import (
SignatureCalculationSparkTransformConfiguration,
)

Expand Down
24 changes: 12 additions & 12 deletions transforms/universal/fdedup/dpk_fdedup/transform_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,20 @@
import os
import sys

import cluster_analysis_transform
import data_cleaning_transform
import get_duplicate_list_transform
import signature_calc_transform
from cluster_analysis.transform_python import (
import dpk_fdedup.cluster_analysis.transform
import dpk_fdedup.data_cleaning.transform
import dpk_fdedup.get_duplicate_list.transform
import dpk_fdedup.signature_calc.transform
from dpk_fdedup.cluster_analysis.transform_python import (
ClusterAnalysisPythonTransformConfiguration,
)
from data_cleaning.transform_python import DataCleaningPythonTransformConfiguration
from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils, get_logger, str2bool
from get_duplicate_list.transform_python import (
from dpk_fdedup.get_duplicate_list.transform_python import (
GetDuplicateListPythonTransformConfiguration,
)
from signature_calc.transform_python import (
from dpk_fdedup.signature_calc.transform_python import (
SignatureCalculationPythonTransformConfiguration,
)

Expand All @@ -47,10 +47,10 @@
}

ARGS_MAP = {
"minhash": signature_calc_transform.captured_arg_keys,
"cluster": cluster_analysis_transform.captured_arg_keys,
"fdlist": get_duplicate_list_transform.captured_arg_keys,
"fdclean": data_cleaning_transform.captured_arg_keys,
"minhash": dpk_fdedup.signature_calc.transform.captured_arg_keys,
"cluster": dpk_fdedup.cluster_analysis.transform.captured_arg_keys,
"fdlist": dpk_fdedup.get_duplicate_list.transform.captured_arg_keys,
"fdclean": dpk_fdedup.data_cleaning.transform.captured_arg_keys,
}


Expand Down
Loading

0 comments on commit b95e99e

Please sign in to comment.