Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
365d2b8
Basic Structure for ColumnDecoder
Isso-W May 17, 2025
d5dc7c1
Basic Structure for ColumnDecoder
Isso-W Jun 9, 2025
3712b9c
Main logic in ColumnDecoderComposite done, ColumnDecoderBin as exampl…
Isso-W Jun 9, 2025
ca836a4
update two encode methods for PassThrough and Recode
EmilyHongYT Jun 10, 2025
f43a853
Main logic in ColumnDecoderComposite done, ColumnDecoderBin as exampl…
Isso-W Jun 12, 2025
84077b0
Mixed test and optimization on Pass through
Isso-W Jun 13, 2025
8fcbbcc
Middle version
Isso-W Jun 15, 2025
c02fa76
Middle version
Isso-W Jun 15, 2025
3c66fcc
Middle version
Isso-W Jun 15, 2025
8da2e0d
Middle version
Isso-W Jun 16, 2025
19a4437
Middle version
Isso-W Jun 16, 2025
34a4341
Bin, Recode, Passthrough done
Isso-W Jun 16, 2025
0c5acdb
update dummy funtion for Processing columns in parallel
EmilyHongYT Jun 18, 2025
98865e7
Middle version
Isso-W Jun 18, 2025
2886537
Middle version
Isso-W Jun 18, 2025
b75b7d1
Middle version
Isso-W Jun 18, 2025
61f6d39
Update pom.xml
Isso-W Jun 19, 2025
839b050
Add Licenses to new classes
Isso-W Jun 30, 2025
595f4b7
Merge remote-tracking branch 'origin/main'
Isso-W Jun 30, 2025
bfcb133
Update ColumnDecoderMixedMethodsTest.java
Isso-W Jun 30, 2025
0e67c0a
little fix to multi-thread, update time measure methode
Isso-W Jun 30, 2025
91fbc54
Merge remote-tracking branch 'origin/main'
Isso-W Jun 30, 2025
ec7fe78
multi-thread fix, bin optimization
Isso-W Jul 1, 2025
6dfa925
Huge refactor: Changed input structure for decoders. Now working prop…
Isso-W Jul 5, 2025
b8c4f88
update record funtion for Processing columns and test it
EmilyHongYT Jul 9, 2025
d62f358
small fixes on bin
Isso-W Jul 10, 2025
56e7cd9
Merge remote-tracking branch 'origin/main'
Isso-W Jul 10, 2025
d7a3228
update recode & dummy funtion for Processing columns and test it
EmilyHongYT Jul 15, 2025
0a35c28
update Factory & PassThrough part for Processing columns and test it
EmilyHongYT Jul 15, 2025
81c405f
print runtime for dummy and recode when processing single column
EmilyHongYT Jul 15, 2025
91cc343
fixed dummy decoder
Isso-W Jul 16, 2025
8448990
update recode & dummy funtion for Processing columns and test it
EmilyHongYT Jul 15, 2025
dd14314
update Factory & PassThrough part for Processing columns and test it
EmilyHongYT Jul 15, 2025
41cb69b
print runtime for dummy and recode when processing single column
EmilyHongYT Jul 15, 2025
ba3dc04
fixed dummy decoder
Isso-W Jul 16, 2025
bb92e76
little improvement
Isso-W Jul 16, 2025
abafe03
Bug fixes and replace decoder
Isso-W Jul 17, 2025
9b67321
Update pom.xml
Isso-W Jul 21, 2025
64651ec
Merge branch 'apache:main' into main
Isso-W Jul 21, 2025
808cba2
cleaned code and add documentations
Isso-W Jul 28, 2025
0156c58
Merge remote-tracking branch 'origin/main'
Isso-W Jul 28, 2025
337334e
remove internal time counter, add test class using FT Benchmark adult…
Isso-W Aug 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.matrix.operators.SimpleOperator;
import org.apache.sysds.runtime.transform.TfUtils;
import org.apache.sysds.runtime.transform.decode.ColumnDecoder;
import org.apache.sysds.runtime.transform.decode.ColumnDecoderFactory;
import org.apache.sysds.runtime.transform.decode.Decoder;
import org.apache.sysds.runtime.transform.decode.DecoderFactory;
import org.apache.sysds.runtime.transform.encode.EncoderFactory;
Expand Down Expand Up @@ -349,12 +351,19 @@ else if(opcode.equalsIgnoreCase(Opcodes.TRANSFORMDECODE.toString())) {
FrameBlock meta = ec.getFrameInput(params.get("meta"));
String[] colnames = meta.getColumnNames();

// compute transformdecode
Decoder decoder = DecoderFactory
.createDecoder(getParameterMap().get("spec"), colnames, null, meta, data.getNumColumns());
FrameBlock fbout = decoder.decode(data, new FrameBlock(decoder.getSchema()));

ColumnDecoder decoder = ColumnDecoderFactory
.createDecoder(getParameterMap().get("spec"), colnames, null, meta, meta.getNumColumns());
FrameBlock out = new FrameBlock(decoder.getMultiSchema());
FrameBlock fbout = decoder.columnDecode(data, out);
fbout.setColumnNames(Arrays.copyOfRange(colnames, 0, fbout.getNumColumns()));


//Decoder decoder = DecoderFactory
// .createDecoder(getParameterMap().get("spec"), colnames, null, meta, data.getNumColumns());
//FrameBlock fbout = decoder.decode(data, new FrameBlock(decoder.getSchema()));
//fbout.setColumnNames(Arrays.copyOfRange(colnames, 0, fbout.getNumColumns()));

// release locks
ec.setFrameOutput(output.getName(), fbout);
ec.releaseMatrixInput(params.get("target"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@
import org.apache.sysds.runtime.transform.TfUtils.TfMethod;
import org.apache.sysds.runtime.transform.decode.Decoder;
import org.apache.sysds.runtime.transform.decode.DecoderFactory;
import org.apache.sysds.runtime.transform.decode.ColumnDecoder;
import org.apache.sysds.runtime.transform.decode.ColumnDecoderFactory;
import org.apache.sysds.runtime.transform.encode.ColumnEncoderBagOfWords;
import org.apache.sysds.runtime.transform.encode.EncoderFactory;
import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.runtime.transform.decode;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;


import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;

/**
* Abstract base class for column-level decoders used in the transform framework.
* Each decoder implements logic to transform encoded columns (e.g., bin, dummy, recode)
* into decoded values during execution (typically from MatrixBlock to FrameBlock).
*
* This class handles metadata fields (e.g., column schema, names, index), provides
* basic serialization logic, and defines abstract decoding methods to be implemented
* by concrete decoders.
*/
public abstract class ColumnDecoder implements Externalizable {
// Logger instance for debugging
protected static final Log LOG = LogFactory.getLog(Decoder.class.getName());
private static final long serialVersionUID = -1732411001366177787L;

// Schema for single-column decoders
protected ValueType _schema;

// Index of the target column (0-based)
protected int _colID;

// For multi-column decoders: value types for all columns
protected ValueType[] _multiSchema;

// Column indices this decoder applies to
protected int[] _colList;

// Column names for metadata tracking (optional)
protected String[] _colnames = null;

// Offset in the input MatrixBlock (0-based)
protected int _offset;

/**
* Constructor for single-column decoders.
*
* @param schema value type of the column
* @param colID column ID (0-based)
* @param offset matrix column index
*/
protected ColumnDecoder(ValueType schema, int colID, int offset) {
_schema = schema;
_colID = colID;
_offset = offset;
}


/**
* Constructor for multi-column decoders.
*
* @param multiSchema value types of all involved columns
* @param colList list of column indices
* @param offset offset in input matrix
*/
protected ColumnDecoder(ValueType[] multiSchema, int[] colList, int offset) {
_multiSchema = multiSchema;
_colList = colList;
_offset = offset;
}

// Basic getter/setter methods for decoder metadata
public int getColOffset() {
return _offset;
}

public ValueType getSchema() {
return _schema;
}

public ValueType[] getMultiSchema() {
return _multiSchema;
}

public int getColID() {
return _colID;
}

public int[] getColList() {return _colList;}

public void setColnames(String[] colnames) {
_colnames = colnames;
}

public String[] getColnames() {
return _colnames;
}

/**
* Block decode API converting a matrix block into a frame block.
*
* @param in Input matrix block
* @param out Output frame block
* @return returns given output frame block for convenience
*/
public abstract FrameBlock columnDecode(MatrixBlock in, FrameBlock out);

/**
* Block decode API converting a matrix block into a frame block in parallel.
*
* @param in Input matrix block
* @param out Output frame block
* @param k Parallelization degree
* @return returns the given output frame block for convenience
*/
public FrameBlock columnDecode(MatrixBlock in, FrameBlock out, int k) {
return columnDecode(in, out);
}

/**
* Block decode row block
*
* @param in input Matrix Block
* @param out output FrameBlock
* @param rl row start to decode
* @param ru row end to decode (not inclusive)
*/
public abstract void columnDecode(MatrixBlock in, FrameBlock out, int rl, int ru);


/**
* Update index-ranges to after decoding. Note that only Dummycoding changes the ranges.
*
* @param beginDims the begin indexes before encoding
* @param endDims the end indexes before encoding
*/
public void updateIndexRanges(long[] beginDims, long[] endDims) {
// do nothing - default
}

public abstract void initMetaData(FrameBlock meta);

/**
* Redirects the default java serialization via externalizable to our default
* hadoop writable serialization for efficient broadcast/rdd serialization.
*
* @param os object output
* @throws IOException if IOException occurs
*/
@Override
public void writeExternal(ObjectOutput os)
throws IOException
{
int size1 = (_colList == null) ? 0 : _colList.length;
os.writeInt(size1);
for(int i = 0; i < size1; i++)
os.writeInt(_colList[i]);

int size2 = (_colnames == null) ? 0 : _colnames.length;
os.writeInt(size2);
for(int j = 0; j < size2; j++)
os.writeUTF(_colnames[j]);

int size3 = (_schema == null) ? 0 : _schema.ordinal();
os.writeInt(size3);
os.writeByte(_schema.ordinal());
}

/**
* Redirects the default java serialization via externalizable to our default
* hadoop writable serialization for efficient broadcast/rdd deserialization.
*
* @param in object input
* @throws IOException if IOException occur
*/
@Override
public void readExternal(ObjectInput in)
throws IOException
{
int size1 = in.readInt();
_colList = (size1 == 0) ? null : new int[size1];
for(int i = 0; i < size1; i++)
_colList[i] = in.readInt();

int size2 = in.readInt();
_colnames = (size2 == 0) ? null : new String[size2];
for(int j = 0; j < size2; j++) {
_colnames[j] = in.readUTF();
}

//int size3 = in.readInt();
//_schema = (size3 == 0) ? null : new ValueType[size3];
//for(int j = 0; j < size3; j++) {
// _schema[j] = ValueType.values()[in.readByte()];
//}
}
}
Loading