Skip to content

Commit

Permalink
Merge pull request #134 from kermitt2/bugfix/lexicon-97
Browse files Browse the repository at this point in the history
Fix derivation items generated when loading the lexicon #97
  • Loading branch information
lfoppiano authored May 23, 2022
2 parents d224075 + 9ea29f3 commit e4d3ee1
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 42 deletions.
25 changes: 5 additions & 20 deletions src/main/java/org/grobid/core/lexicon/QuantityLexicon.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class QuantityLexicon {
public static final String UNITS_FILENAME = "units.json";
public static final String UNITS_EN_PATH = "lexicon/en/" + UNITS_FILENAME;

private static final String COMPOSED_UNIT_REGEX = "[^/*]";
private static final String COMPOSED_UNIT_REGEX = "[^/*]+";
private static final String COMPOSED_UNIT_REGEX_WITH_DELIMITER = String.format("((?<=%1$s)|(?=%1$s))", "[/*]{1}");

private Pattern composedUnitPattern = Pattern.compile(COMPOSED_UNIT_REGEX);
Expand Down Expand Up @@ -91,7 +91,7 @@ private void init() {
unitPattern = new FastMatcher();

prefixes = loadPrefixes(this.getClass().getClassLoader().getResourceAsStream(PREFIX_EN_PATH));
readJsonFile(this.getClass().getClassLoader().getResourceAsStream(UNITS_EN_PATH), "units", l -> processJsonNode(l));
readJsonFile(this.getClass().getClassLoader().getResourceAsStream(UNITS_EN_PATH), "units", this::processJsonNode);

numberTokens = WordsToNumber.getInstance().getTokenSet();
}
Expand All @@ -110,11 +110,11 @@ private void processJsonNode(JsonNode node) {
unitDefinition.setSupportsPrefixes(supportsPrefixes.asBoolean());
} else {
// Imperial and US customary units won't need prefixes
if (Arrays.asList(UnitUtilities.System_Type.IMPERIAL, UnitUtilities.System_Type.US)
if (Arrays.asList(UnitUtilities.System_Type.SI_BASE, UnitUtilities.System_Type.SI_DERIVED)
.contains(unitDefinition.getSystem())) {
unitDefinition.setSupportsPrefixes(false);
} else {
unitDefinition.setSupportsPrefixes(true);
} else {
unitDefinition.setSupportsPrefixes(false);
}
}

Expand Down Expand Up @@ -404,21 +404,6 @@ public List<RegexValueHolder> decomposeComplexUnit(String unitTerm) {
return decomposition;
}

/**
 * Splits a composed unit expression around its {@code /} and {@code *} delimiters,
 * keeping each delimiter as its own item in the result.
 *
 * <p>For example {@code "m/s2"} yields three items: {@code "m"} [0, 1),
 * {@code "/"} [1, 2) and {@code "s2"} [2, 4). The offsets are character positions
 * in {@code unitTerm}, accumulated from the lengths of the consecutive pieces.
 *
 * @param unitTerm the raw unit expression to split; must not be {@code null}
 * @return the pieces of {@code unitTerm} in order of appearance, each with its
 *         start (inclusive) and end (exclusive) offset
 * @deprecated prefer {@link #decomposeComplexUnit(String)}
 */
@SuppressWarnings("deprecation")
@Deprecated
public static List<RegexValueHolder> decomposeComplexUnitWithDelimiter(String unitTerm) {
    List<RegexValueHolder> decomposition = new ArrayList<>();

    // The pattern is made of zero-width look-arounds, so consecutive pieces are
    // contiguous: the end of one piece is the start of the next.
    int start = 0;
    for (String split : unitTerm.split(COMPOSED_UNIT_REGEX_WITH_DELIMITER)) {
        int end = start + split.length();
        decomposition.add(new RegexValueHolder(split, start, end));
        start = end;
    }

    return decomposition;
}

/**
* Soft look-up in unit dictionary
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
package org.grobid.core.utilities;

import com.google.common.collect.Iterables;
import net.sf.saxon.lib.Logger;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.assertj.core.data.Offset;
import org.grobid.core.data.Measurement;
import org.grobid.core.data.Quantity;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;

import java.util.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;

import static org.apache.commons.collections4.CollectionUtils.isEmpty;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,43 +180,51 @@ public void testInPrefixDictionary_G() throws Exception {
assertThat(target.inPrefixDictionary("G"), is(true));
}

/**
 * A product of two units ("m*s2") must be decomposed into its two operands,
 * with offsets that skip over the '*' delimiter.
 */
@Test
public void testDecomposeComplexUnitWithDelimiterMultiplication() throws Exception {
    final List<RegexValueHolder> parts = target.decomposeComplexUnit("m*s2");

    assertThat(parts.size(), is(2));

    // Left operand: "m" spans [0, 1)
    final RegexValueHolder left = parts.get(0);
    assertThat(left.getValue(), is("m"));
    assertThat(left.getStart(), is(0));
    assertThat(left.getEnd(), is(1));

    // Right operand: "s2" spans [2, 4); offset 1 is the delimiter itself
    final RegexValueHolder right = parts.get(1);
    assertThat(right.getValue(), is("s2"));
    assertThat(right.getStart(), is(2));
    assertThat(right.getEnd(), is(4));
}

@Test
public void testDecomposeComplexUnitWithDelimiter() throws Exception {
List<RegexValueHolder> out = target.decomposeComplexUnitWithDelimiter("m/s2");
List<RegexValueHolder> out = target.decomposeComplexUnit("m/s2");

assertThat(out.size(), is(3));
assertThat(out.size(), is(2));

assertThat(out.get(0).getValue(), is("m"));
assertThat(out.get(0).getStart(), is(0));
assertThat(out.get(0).getEnd(), is(1));
assertThat(out.get(1).getValue(), is("/"));
assertThat(out.get(1).getStart(), is(1));
assertThat(out.get(1).getEnd(), is(2));
assertThat(out.get(2).getValue(), is("s2"));
assertThat(out.get(2).getStart(), is(2));
assertThat(out.get(2).getEnd(), is(4));
assertThat(out.get(1).getValue(), is("s2"));
assertThat(out.get(1).getStart(), is(2));
assertThat(out.get(1).getEnd(), is(4));
}

@Test
public void testDecomposeComplexUnitWithDelimiter2() throws Exception {
List<RegexValueHolder> out = target.decomposeComplexUnitWithDelimiter("mol/m^3");
List<RegexValueHolder> out = target.decomposeComplexUnit("mol/m^3");

assertThat(out.size(), is(3));
assertThat(out.size(), is(2));

assertThat(out.get(0).getValue(), is("mol"));
assertThat(out.get(0).getStart(), is(0));
assertThat(out.get(0).getEnd(), is(3));
assertThat(out.get(1).getValue(), is("/"));
assertThat(out.get(1).getStart(), is(3));
assertThat(out.get(1).getEnd(), is(4));
assertThat(out.get(2).getValue(), is("m^3"));
assertThat(out.get(2).getStart(), is(4));
assertThat(out.get(2).getEnd(), is(7));
assertThat(out.get(1).getValue(), is("m^3"));
assertThat(out.get(1).getStart(), is(4));
assertThat(out.get(1).getEnd(), is(7));
}

@Test
public void testDecomposeComplexUnitWithDelimiter3() throws Exception {
List<RegexValueHolder> out = target.decomposeComplexUnitWithDelimiter("cm⁻¹");
List<RegexValueHolder> out = target.decomposeComplexUnit("cm⁻¹");

assertThat(out.size(), is(1));

Expand Down

0 comments on commit e4d3ee1

Please sign in to comment.