Skip to content

Commit 84840ec

Browse files
committed
fix: remove duplicated cell when interleave filter is applied
It seems Bigtable classic client & veneer client behaves differently for duplicated `cells`. A Row can contain duplicated cell when user applies `Interleave` filters. Behavior of both of these clients are: - Classic client: Here, we check if a `cell` contains some `labels` or not, If it does then we include that cell in end result, if it doesn't then we compare `timestamp` and `qualifier` with previous no label cell(Because a cell with label could have been produced by applying filters). - Veneer client: Here, we do not performs these checks.(Most of other client allows duplicate cells confirmed on `Go/NodeJs/C#` bigtable client. Not sure why but `python-bigtable` does not allows duplicate cells) **Assumption:** The labels are applied to determine which filters produced those cells, So we are including all cells where labels are present. chore: added javadoc to inform user about this bug - Added class JavaDoc and code comment for future reference. - reset the `previousNoLabelCell` in reset(). - added unit test to verify dedupe logic.
1 parent a2dbe80 commit 84840ec

File tree

2 files changed

+92
-1
lines changed

2 files changed

+92
-1
lines changed

bigtable-client-core-parent/bigtable-hbase/src/main/java/com/google/cloud/bigtable/hbase/wrappers/veneer/RowResultAdapter.java

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
package com.google.cloud.bigtable.hbase.wrappers.veneer;
1717

1818
import com.google.api.core.InternalApi;
19+
import com.google.bigtable.v2.RowFilter.Interleave;
20+
import com.google.cloud.bigtable.data.v2.models.Filters.InterleaveFilter;
1921
import com.google.cloud.bigtable.data.v2.models.RowAdapter;
2022
import com.google.cloud.bigtable.hbase.adapters.read.RowCell;
2123
import com.google.cloud.bigtable.hbase.util.ByteStringer;
@@ -25,16 +27,26 @@
2527
import com.google.common.collect.ImmutableList;
2628
import com.google.protobuf.ByteString;
2729
import java.util.ArrayList;
30+
import java.util.Arrays;
2831
import java.util.List;
2932
import java.util.Map;
3033
import java.util.Objects;
3134
import java.util.TreeMap;
3235
import org.apache.hadoop.hbase.Cell;
3336
import org.apache.hadoop.hbase.client.Result;
37+
import org.apache.hadoop.hbase.filter.FilterList.Operator;
3438

3539
/**
3640
* Adapter for {@link RowAdapter} that uses {@link Result} to represent logical rows.
3741
*
42+
* <p>This adapter is responsible for cell deduplication. bigtable-hbase will convert a {@link
43+
* Operator#MUST_PASS_ONE} filter into an {@link Interleave}. Unfortunately there is a bit of a
44+
* mismatch between the 2 filters: MUST_PASS_ONE will not duplicate the cell if it matches multiple
45+
* branches of the MUST_PASS_ONE, but Interleave will.This adapter will pave over the difference by
46+
* removing the duplicate cells while building the Result. However, HBase's WhileMatchFilter depends
47+
* on duplicate labelled cells for its implementation. So this adapter will not deduplicate labelled
48+
* cells.
49+
*
3850
* <p>For internal use only - public for technical reasons.
3951
*/
4052
@InternalApi("For internal usage only")
@@ -97,6 +109,7 @@ static class RowResultBuilder implements RowBuilder<Result> {
97109
private List<String> labels;
98110
private long timestamp;
99111
private byte[] value;
112+
private RowCell previousNoLabelCell;
100113

101114
private Map<String, List<RowCell>> cells = new TreeMap<>();
102115
private List<RowCell> currentFamilyCells = null;
@@ -159,6 +172,9 @@ public void cellValue(ByteString newValue) {
159172
* </ul>
160173
*
161174
* A flattened version of the {@link RowCell} map will be sorted correctly.
175+
*
176+
* <p>Applying {@link InterleaveFilter} may result in a row to contain duplicated cells, where
177+
* duplicates are grouped in sequences.
162178
*/
163179
@Override
164180
public void finishCell() {
@@ -169,6 +185,7 @@ public void finishCell() {
169185
previousFamily = this.family;
170186
currentFamilyCells = new ArrayList<>();
171187
cells.put(this.family, this.currentFamilyCells);
188+
previousNoLabelCell = null;
172189
}
173190

174191
RowCell rowCell =
@@ -179,7 +196,20 @@ public void finishCell() {
179196
TimestampConverter.bigtable2hbase(this.timestamp),
180197
this.value,
181198
this.labels);
182-
this.currentFamilyCells.add(rowCell);
199+
200+
// dedupe user visible cells. Please see class javadoc for details
201+
if (!this.labels.isEmpty()) {
202+
this.currentFamilyCells.add(rowCell);
203+
} else if (!keysMatch(previousNoLabelCell, rowCell)) {
204+
this.currentFamilyCells.add(rowCell);
205+
this.previousNoLabelCell = rowCell;
206+
}
207+
}
208+
209+
private static boolean keysMatch(RowCell previousNoLabelCell, RowCell current) {
210+
return previousNoLabelCell != null
211+
&& previousNoLabelCell.getTimestamp() == current.getTimestamp()
212+
&& Arrays.equals(previousNoLabelCell.getQualifierArray(), current.getQualifierArray());
183213
}
184214

185215
/**
@@ -210,6 +240,7 @@ public void reset() {
210240
this.cells = new TreeMap<>();
211241
this.currentFamilyCells = null;
212242
this.previousFamily = null;
243+
this.previousNoLabelCell = null;
213244
}
214245

215246
@Override

bigtable-client-core-parent/bigtable-hbase/src/test/java/com/google/cloud/bigtable/hbase/wrappers/veneer/TestRowResultAdapter.java

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,66 @@ public void testOnlyValueIsDifferent() {
154154
assertResult(expected, actual);
155155
}
156156

157+
@Test
158+
public void testDeduplicationLogic() {
159+
List<String> EMPTY_LABEL = ImmutableList.of();
160+
ByteString valueForNewQual = ByteString.copyFromUtf8("value for new qualifier");
161+
ByteString valueForCellWithoutLabel = ByteString.copyFromUtf8("value for Cell without label");
162+
ByteString valueForCellWithLabel = ByteString.copyFromUtf8("value for cell with labels");
163+
RowAdapter.RowBuilder<Result> rowBuilder = underTest.createRowBuilder();
164+
rowBuilder.startRow(ROW_KEY);
165+
166+
// new qualifier
167+
rowBuilder.startCell(COL_FAMILY, QUAL_ONE, 20000L, EMPTY_LABEL, valueForNewQual.size());
168+
rowBuilder.cellValue(valueForNewQual);
169+
rowBuilder.finishCell();
170+
171+
// same qualifier, same timestamp, without label
172+
rowBuilder.startCell(COL_FAMILY, QUAL_ONE, 20000L, EMPTY_LABEL, -1);
173+
rowBuilder.cellValue(valueForCellWithoutLabel);
174+
rowBuilder.finishCell();
175+
176+
// same qualifier, same timestamp, with label
177+
rowBuilder.startCell(COL_FAMILY, QUAL_ONE, 20000L, LABELS, -1);
178+
rowBuilder.cellValue(valueForCellWithLabel);
179+
rowBuilder.finishCell();
180+
181+
// same qualifier, different timestamp
182+
rowBuilder.startCell(COL_FAMILY, QUAL_ONE, 30000L, EMPTY_LABEL, VALUE.size());
183+
rowBuilder.cellValue(VALUE);
184+
rowBuilder.finishCell();
185+
186+
Result actual = rowBuilder.finishRow();
187+
assertEquals(3, actual.size());
188+
189+
RowResult expected =
190+
RowResult.create(
191+
ROW_KEY,
192+
ImmutableList.<Cell>of(
193+
new RowCell(
194+
ROW_KEY.toByteArray(),
195+
COL_FAMILY.getBytes(),
196+
QUAL_ONE.toByteArray(),
197+
20L,
198+
valueForNewQual.toByteArray(),
199+
EMPTY_LABEL),
200+
new RowCell(
201+
ROW_KEY.toByteArray(),
202+
COL_FAMILY.getBytes(),
203+
QUAL_ONE.toByteArray(),
204+
20L,
205+
valueForCellWithLabel.toByteArray(),
206+
LABELS),
207+
new RowCell(
208+
ROW_KEY.toByteArray(),
209+
COL_FAMILY.getBytes(),
210+
QUAL_ONE.toByteArray(),
211+
30L,
212+
VALUE.toByteArray(),
213+
EMPTY_LABEL)));
214+
assertResult(expected, actual);
215+
}
216+
157217
@Test
158218
public void testFamilyOrdering() {
159219
RowAdapter.RowBuilder<Result> rowBuilder = underTest.createRowBuilder();

0 commit comments

Comments
 (0)