From 23be72c68886fc21051dabb3587c8876f2f68459 Mon Sep 17 00:00:00 2001 From: Onur Sumer Date: Tue, 1 Oct 2024 18:13:37 -0400 Subject: [PATCH] Normalize data counts in a generic case-insensitive way --- .../impl/StudyViewColumnarServiceImpl.java | 60 +++++++++++++++++-- .../StudyViewFilterMapper.xml | 4 +- .../mybatisclickhouse/StudyViewMapper.xml | 16 ----- .../StudyViewMapperClinicalDataCountTest.java | 10 +++- 4 files changed, 63 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java b/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java index 00ddc3c5823..fc9af001d41 100644 --- a/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java +++ b/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java @@ -29,11 +29,8 @@ import org.springframework.cache.annotation.Cacheable; import org.springframework.stereotype.Service; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; +import java.util.*; import java.util.stream.Collectors; -import java.util.Map; @Service public class StudyViewColumnarServiceImpl implements StudyViewColumnarService { @@ -175,15 +172,66 @@ private StudyViewFilterContext createContext(StudyViewFilter studyViewFilter) { } private List generateDataCountItemsFromDataCounts(List dataCounts) { - return dataCounts.stream().collect(Collectors.groupingBy(ClinicalDataCount::getAttributeId)) + return dataCounts.stream().collect(Collectors.groupingBy(ClinicalDataCount::getAttributeId)) .entrySet().parallelStream().map(e -> { ClinicalDataCountItem item = new ClinicalDataCountItem(); item.setAttributeId(e.getKey()); - item.setCounts(e.getValue()); + item.setCounts(normalizeDataCounts(e.getValue())); return item; }).toList(); } + /** + * Normalizes data counts by merging attribute values in a case-insensitive way. 
+ * For example, attribute values "TRUE", "True", and "true" will be merged into a single aggregated count. Values are lower-cased with Locale.ROOT so that the grouping does not depend on the JVM default locale. + * This method assumes that all the counts in the given dataCounts list have the same attributeId and a non-null value. + * + * @param dataCounts list of data counts for a single attribute + * + * @return normalized list of data counts + */ + private List normalizeDataCounts(List dataCounts) { + Collection normalizedDataCounts = dataCounts + .stream() + .collect( + Collectors.groupingBy( + c -> c.getValue().toLowerCase(Locale.ROOT), + Collectors.reducing(new ClinicalDataCount(), (count1, count2) -> { + // assuming attribute ids are the same for all data counts, just pick the first non-null one + String attributeId = + count1.getAttributeId() != null + ? count1.getAttributeId() + : count2.getAttributeId(); + + // pick the value in a deterministic way by prioritizing lower case over upper case. + // for example, 'True' will be picked in case of 2 different values like 'TRUE' and 'True', + // and 'true' will be picked in case of 3 different values like 'TRUE', 'True', and 'true' + String value = count1.getValue() != null + ? count1.getValue() + : count2.getValue(); + if (count1.getValue() != null && count2.getValue() != null) { + value = count1.getValue().compareTo(count2.getValue()) > 0 + ? count1.getValue() + : count2.getValue(); + } + + // aggregate counts for the merged values, treating a missing count as 0 + Integer count = (count1.getCount() != null ? count1.getCount(): 0) + + (count2.getCount() != null ? 
count2.getCount(): 0); + + ClinicalDataCount aggregated = new ClinicalDataCount(); + aggregated.setAttributeId(attributeId); + aggregated.setValue(value); + aggregated.setCount(count); + return aggregated; + }) + ) + ) + .values(); + + return new ArrayList<>(normalizedDataCounts); + } + public static List calculateMissingNaCountsForClinicalDataCountItems( List clinicalDataCountItems, List filteredAttributes, diff --git a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml index 468731e9bc1..b1e179be264 100644 --- a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml +++ b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml @@ -381,7 +381,7 @@ - ) = '${dataFilterValue.value}' + ) ILIKE '${dataFilterValue.value}' @@ -578,7 +578,7 @@ - ) = '${dataFilterValue.value}' + ) ILIKE '${dataFilterValue.value}' diff --git a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml index ac0d54407e8..1621cb4a646 100644 --- a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml +++ b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml @@ -755,28 +755,12 @@ OR upperUTF8(${attribute_value})='N/A' - - - upperUTF8(${attribute_value})='TRUE' - - - upperUTF8(${attribute_value})='FALSE' - - multiIf( , 'NA', - - - , - 'True', - - - , - 'False', ${attribute_value} ) diff --git a/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapperClinicalDataCountTest.java b/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapperClinicalDataCountTest.java index 8013aa5f6a2..26a5466ace7 100644 --- 
a/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapperClinicalDataCountTest.java +++ b/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapperClinicalDataCountTest.java @@ -89,9 +89,13 @@ public void getDeadCounts() { Collections.emptyList() ); - assertEquals(6, categoricalClinicalDataCounts.size()); - assertEquals(3, findClinicaDataCount(categoricalClinicalDataCounts, "True")); - assertEquals(4, findClinicaDataCount(categoricalClinicalDataCounts, "False")); + assertEquals(10, categoricalClinicalDataCounts.size()); + assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "True")); + assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "TRUE")); + assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "true")); + assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "False")); + assertEquals(2, findClinicaDataCount(categoricalClinicalDataCounts, "FALSE")); + assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "false")); assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Not Released")); assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Not Collected")); assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Unknown"));