Skip to content

[DE-83] Bugfix/stopwords #414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docker/start_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Configuration environment variables:
# STARTER_MODE: (single|cluster|activefailover), default single
# DOCKER_IMAGE: ArangoDB docker image, default gcr.io/gcr-for-testing/arangodb/arangodb:latest
# DOCKER_IMAGE: ArangoDB docker image, default docker.io/arangodb/arangodb:latest
# SSL: (true|false), default false
# DATABASE_EXTENDED_NAMES: (true|false), default false
# ARANGO_LICENSE_KEY: only required for ArangoDB Enterprise
Expand All @@ -11,11 +11,11 @@
# STARTER_MODE=cluster SSL=true ./start_db.sh

STARTER_MODE=${STARTER_MODE:=single}
DOCKER_IMAGE=${DOCKER_IMAGE:=gcr.io/gcr-for-testing/arangodb/arangodb:latest}
DOCKER_IMAGE=${DOCKER_IMAGE:=docker.io/arangodb/arangodb:latest}
SSL=${SSL:=false}
DATABASE_EXTENDED_NAMES=${DATABASE_EXTENDED_NAMES:=false}

STARTER_DOCKER_IMAGE=gcr.io/gcr-for-testing/arangodb/arangodb-starter:latest
STARTER_DOCKER_IMAGE=docker.io/arangodb/arangodb-starter:latest

# exit when any command fails
set -e
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
* @author Michele Rastelli
Expand All @@ -38,26 +39,89 @@ private static String stringToHex(String str) {
return hex.toString();
}

/**
 * Decodes a hex string by converting each consecutive pair of hex digits
 * into a single character. A trailing unpaired digit is ignored.
 *
 * @param hex string of hexadecimal digits
 * @return the decoded string
 * @throws NumberFormatException if a pair cannot be parsed as base-16
 */
private static String hexToString(String hex) {
    final int pairs = hex.length() / 2;
    final char[] decoded = new char[pairs];
    for (int p = 0; p < pairs; p++) {
        final int start = 2 * p;
        final String digits = hex.substring(start, start + 2);
        decoded[p] = (char) Integer.parseInt(digits, 16);
    }
    return new String(decoded);
}

/**
 * Creates properties with an empty stopword list and the hex flag set to
 * {@code true}.
 */
public StopwordsAnalyzerProperties() {
    this.hex = true;
    this.stopwords = new ArrayList<>();
}

private List<String> stopwords;
private final List<String> stopwords;
private final boolean hex;

/**
* @return array of hex-encoded strings that describe the tokens to be discarded.
* @return list of hex-encoded strings that describe the tokens to be discarded.
* @deprecated use {@link #getStopwordsAsHexList()} instead
*/
@Deprecated
public List<String> getStopwords() {
return stopwords;
return getStopwordsAsHexList();
}

/**
 * Returns the stopwords as verbatim strings.
 * <p>
 * When the hex flag is not set, the internal list itself is returned.
 * Otherwise every stored entry is decoded with {@code hexToString} and
 * collected into a new list.
 *
 * @return list of verbatim strings that describe the tokens to be discarded.
 */
public List<String> getStopwordsAsStringList() {
    if (!hex) {
        return stopwords;
    }
    final List<String> verbatim = new ArrayList<>(stopwords.size());
    for (final String encoded : stopwords) {
        verbatim.add(hexToString(encoded));
    }
    return verbatim;
}

/**
 * Returns the stopwords as hex-encoded strings.
 * <p>
 * When the hex flag is set, the internal list itself is returned.
 * Otherwise every stored entry is encoded with {@code stringToHex} and
 * collected into a new list.
 *
 * @return list of hex-encoded strings that describe the tokens to be discarded.
 */
public List<String> getStopwordsAsHexList() {
    if (hex) {
        return stopwords;
    }
    final List<String> encoded = new ArrayList<>(stopwords.size());
    for (final String verbatim : stopwords) {
        encoded.add(stringToHex(verbatim));
    }
    return encoded;
}

/**
 * Tells how the entries of {@link #stopwords} are to be interpreted.
 *
 * @return {@code true} if each string is hex-encoded, {@code false} if each
 * string is used verbatim.
 */
public boolean getHex() {
    return this.hex;
}

/**
* @param value stopword as verbatim string
* @return this
*/
public StopwordsAnalyzerProperties addStopwordAsString(final String value) {
stopwords.add(stringToHex(value));
if (hex) {
stopwords.add(stringToHex(value));
} else {
stopwords.add(value);
}
return this;
}

/**
* @param value stopword as hex string
* @return this
*/
public StopwordsAnalyzerProperties addStopwordAsHex(final String value) {
stopwords.add(value);
if (hex) {
stopwords.add(value);
} else {
stopwords.add(hexToString(value));
}
return this;
}

Expand All @@ -66,11 +130,11 @@ public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
StopwordsAnalyzerProperties that = (StopwordsAnalyzerProperties) o;
return Objects.equals(stopwords, that.stopwords);
return hex == that.hex && Objects.equals(stopwords, that.stopwords);
}

@Override
public int hashCode() {
return Objects.hash(stopwords);
return Objects.hash(stopwords, hex);
}
}
12 changes: 9 additions & 3 deletions src/test/java/com/arangodb/ArangoSearchTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -870,15 +870,21 @@ public void stopwordsAnalyzer() {
.addStopwordAsHex("616e64")
.addStopwordAsString("the");

assertThat(properties.getStopwords(), hasItem("616e64"));
assertThat(properties.getStopwords(), hasItem("746865"));
assertThat(properties.getStopwordsAsStringList(), hasItem("and"));
assertThat(properties.getStopwordsAsHexList(), hasItem("746865"));

StopwordsAnalyzer analyzer = new StopwordsAnalyzer();
analyzer.setName("test-" + UUID.randomUUID().toString());
String name = "test-" + UUID.randomUUID().toString();
analyzer.setName(name);
analyzer.setProperties(properties);
analyzer.setFeatures(features);

createGetAndDeleteTypedAnalyzer(analyzer);
db.createSearchAnalyzer(analyzer);
String res = db.query("RETURN FLATTEN(TOKENS(SPLIT('the fox and the dog and a theater', ' '), @aName))",
Collections.singletonMap("aName", name), String.class).next();
assertThat(res, is("[\"fox\",\"dog\",\"a\",\"theater\"]"));
db.deleteSearchAnalyzer(name);
}

@Test
Expand Down