From 137cf2043b7fb35569f31b94cf7d22e7a64a5953 Mon Sep 17 00:00:00 2001
From: Paulo-Mansano
Date: Sun, 29 Jun 2025 21:38:26 +0200
Subject: [PATCH] New classes: ColumnEncoderRagged,
TransformDummySeparatedTest, TransformRaggedTest. Modified: TfUtils,
ColumnEncoder, EncoderFactory, TransformCustomTest
---
hello.dml | 1 +
.../sysds/runtime/transform/TfUtils.java | 2 +-
.../transform/encode/ColumnEncoder.java | 4 +-
.../transform/encode/ColumnEncoderRagged.java | 197 ++++++++++++++++
.../transform/encode/EncoderFactory.java | 11 +-
.../frame/transform/TransformCustomTest.java | 5 +-
.../TransformDummySeparatedTest.java | 74 ++++++
.../frame/transform/TransformRaggedTest.java | 218 ++++++++++++++++++
8 files changed, 505 insertions(+), 7 deletions(-)
create mode 100644 hello.dml
create mode 100644 src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java
create mode 100644 src/test/java/org/apache/sysds/test/component/frame/transform/TransformDummySeparatedTest.java
create mode 100644 src/test/java/org/apache/sysds/test/component/frame/transform/TransformRaggedTest.java
diff --git a/hello.dml b/hello.dml
new file mode 100644
index 00000000000..03be2397d3c
--- /dev/null
+++ b/hello.dml
@@ -0,0 +1 @@
+print("Hello SystemDS")
diff --git a/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java b/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java
index 67ee1776f1c..cbeb4cfa360 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java
@@ -47,7 +47,7 @@ protected byte toID() {
//transform methods
public enum TfMethod {
- IMPUTE, RECODE, HASH, BIN, DUMMYCODE, UDF, OMIT, WORD_EMBEDDING, BAG_OF_WORDS;
+ IMPUTE, RECODE, HASH, BIN, DUMMYCODE, UDF, OMIT, WORD_EMBEDDING, BAG_OF_WORDS, RAGGED;
@Override
public String toString() {
return name().toLowerCase();
diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java
index 037e7bea1d7..0ac6ec9f924 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java
@@ -75,7 +75,7 @@ public void initEmbeddings(MatrixBlock embeddings){
}
protected enum TransformType{
- BIN, RECODE, DUMMYCODE, FEATURE_HASH, PASS_THROUGH, UDF, WORD_EMBEDDING, BAG_OF_WORDS, N_A
+ BIN, RECODE, DUMMYCODE, FEATURE_HASH, PASS_THROUGH, UDF, WORD_EMBEDDING, BAG_OF_WORDS, RAGGED, N_A
}
protected ColumnEncoder(int colID) {
@@ -447,7 +447,7 @@ protected void setApplyRowBlocksPerColumn(int nPart) {
}
public enum EncoderType {
- Recode, FeatureHash, PassThrough, Bin, Dummycode, Omit, MVImpute, Composite, WordEmbedding, BagOfWords
+ Recode, FeatureHash, PassThrough, Bin, Dummycode, Omit, MVImpute, Composite, WordEmbedding, BagOfWords, Ragged
}
/*
diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java
new file mode 100644
index 00000000000..a326c71af73
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java
@@ -0,0 +1,197 @@
+package org.apache.sysds.runtime.transform.encode;
+
+import org.apache.sysds.runtime.controlprogram.caching.CacheBlock;
+import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.transform.TfUtils;
+import org.apache.sysds.runtime.transform.TfUtils.TfMethod;
+import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.common.Types.ValueType;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Encodes a column using a ragged-array/dictionary representation to reduce memory usage.
+ * Stores the unique values in a dictionary and replaces each occurrence with its dictionary index.
+ */
+public class ColumnEncoderRagged extends ColumnEncoder {
+ private static final long serialVersionUID = 2291732648968734088L;
+
+ // Dictionary storage
+ private Object[] _dict;
+ private int _dictSize;
+ private int _nullIndex = -1;
+
+ // Reverse mapping for fast lookups
+ private transient Map