Skip to content

Commit

Permalink
Support custom token chars in (edge)ngram tokenizer
Browse files Browse the repository at this point in the history
Relates: #4341

This commit adds support for custom token characters for the
edge_ngram and ngram tokenizers.
  • Loading branch information
russcam committed Feb 11, 2020
1 parent 6b3f816 commit b14cfdc
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 10 deletions.
30 changes: 25 additions & 5 deletions src/Nest/Analysis/Tokenizers/NGram/EdgeNGramTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@ public interface IEdgeNGramTokenizer : ITokenizer
/// </summary>
[DataMember(Name ="token_chars")]
IEnumerable<TokenChar> TokenChars { get; set; }

/// <summary>
/// Custom characters that should be treated as part of a token. For example,
/// setting this to +-_ will make the tokenizer treat the plus, minus and
/// underscore signs as part of a token.
/// <para />
/// Requires setting <see cref="TokenChar.Custom"/> as part of <see cref="TokenChars"/>
/// <para />
/// Available in Elasticsearch 7.6.0+.
/// </summary>
[DataMember(Name = "custom_token_chars")]
string CustomTokenChars { get; set; }
}

/// <inheritdoc />
Expand All @@ -44,6 +56,9 @@ public class EdgeNGramTokenizer : TokenizerBase, IEdgeNGramTokenizer

/// <inheritdoc />
public IEnumerable<TokenChar> TokenChars { get; set; }

/// <inheritdoc />
public string CustomTokenChars { get; set; }
}

/// <inheritdoc />
Expand All @@ -52,22 +67,27 @@ public class EdgeNGramTokenizerDescriptor
{
protected override string Type => "edge_ngram";
int? IEdgeNGramTokenizer.MaxGram { get; set; }

int? IEdgeNGramTokenizer.MinGram { get; set; }
IEnumerable<TokenChar> IEdgeNGramTokenizer.TokenChars { get; set; }

/// <inheritdoc />
string IEdgeNGramTokenizer.CustomTokenChars { get; set; }

/// <inheritdoc cref="IEdgeNGramTokenizer.MinGram" />
public EdgeNGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);

/// <inheritdoc />
/// <inheritdoc cref="IEdgeNGramTokenizer.MaxGram" />
public EdgeNGramTokenizerDescriptor MaxGram(int? maxGram) => Assign(maxGram, (a, v) => a.MaxGram = v);

/// <inheritdoc />
/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
public EdgeNGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc />
/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
public EdgeNGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) =>
Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc cref="IEdgeNGramTokenizer.CustomTokenChars" />
public EdgeNGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
}
}
30 changes: 25 additions & 5 deletions src/Nest/Analysis/Tokenizers/NGram/NGramTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@ public interface INGramTokenizer : ITokenizer
/// </summary>
[DataMember(Name ="token_chars")]
IEnumerable<TokenChar> TokenChars { get; set; }

/// <summary>
/// Custom characters that should be treated as part of a token. For example,
/// setting this to +-_ will make the tokenizer treat the plus, minus and
/// underscore signs as part of a token.
/// <para />
/// Requires setting <see cref="TokenChar.Custom"/> as part of <see cref="TokenChars"/>
/// <para />
/// Available in Elasticsearch 7.6.0+.
/// </summary>
[DataMember(Name = "custom_token_chars")]
string CustomTokenChars { get; set; }
}

/// <inheritdoc />
Expand All @@ -44,6 +56,9 @@ public class NGramTokenizer : TokenizerBase, INGramTokenizer

/// <inheritdoc />
public IEnumerable<TokenChar> TokenChars { get; set; }

/// <inheritdoc />
public string CustomTokenChars { get; set; }
}

/// <inheritdoc />
Expand All @@ -52,21 +67,26 @@ public class NGramTokenizerDescriptor
{
protected override string Type => "ngram";
int? INGramTokenizer.MaxGram { get; set; }

int? INGramTokenizer.MinGram { get; set; }
IEnumerable<TokenChar> INGramTokenizer.TokenChars { get; set; }

/// <inheritdoc />
string INGramTokenizer.CustomTokenChars { get; set; }

/// <inheritdoc cref="INGramTokenizer.MinGram" />
public NGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);

/// <inheritdoc />
/// <inheritdoc cref="INGramTokenizer.MaxGram" />
public NGramTokenizerDescriptor MaxGram(int? minGram) => Assign(minGram, (a, v) => a.MaxGram = v);

/// <inheritdoc />
/// <inheritdoc cref="INGramTokenizer.TokenChars" />
public NGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc />
/// <inheritdoc cref="INGramTokenizer.TokenChars" />
public NGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) => Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc cref="INGramTokenizer.CustomTokenChars" />
public NGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
}
}
8 changes: 8 additions & 0 deletions src/Nest/Analysis/Tokenizers/NGram/TokenChar.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,13 @@ public enum TokenChar

[EnumMember(Value = "symbol")]
Symbol,

/// <summary>
/// Custom token characters.
/// <para />
/// Available in Elasticsearch 7.6.0+.
/// </summary>
[EnumMember(Value = "custom")]
Custom,
}
}
60 changes: 60 additions & 0 deletions tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,36 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>
public override string Name => "endgen";
}

[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
{
	// Fluent form: an edge_ngram tokenizer producing 1-2 character grams,
	// treating the custom characters +, - and _ as token characters.
	public override FuncTokenizer Fluent => (tokenizerName, tokenizers) => tokenizers
		.EdgeNGram(tokenizerName, edgeNGram => edgeNGram
			.MinGram(1)
			.MaxGram(2)
			.TokenChars(TokenChar.Custom)
			.CustomTokenChars("+-_")
		);

	// Object-initializer form, equivalent to the fluent definition above.
	public override ITokenizer Initializer => new EdgeNGramTokenizer
	{
		MinGram = 1,
		MaxGram = 2,
		TokenChars = new[] { TokenChar.Custom },
		CustomTokenChars = "+-_"
	};

	// Expected JSON sent to Elasticsearch for either form.
	public override object Json => new
	{
		min_gram = 1,
		max_gram = 2,
		token_chars = new[] { "custom" },
		custom_token_chars = "+-_",
		type = "edge_ngram"
	};

	public override string Name => "endgen_custom";
}

public class NGramTests : TokenizerAssertionBase<NGramTests>
{
public override FuncTokenizer Fluent => (n, t) => t.NGram(n, e => e
Expand All @@ -60,6 +90,36 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>
public override string Name => "ng";
}

[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomTokenCharsTests>
{
	// Fluent form: an ngram tokenizer producing 1-2 character grams,
	// treating the custom characters +, - and _ as token characters.
	public override FuncTokenizer Fluent => (tokenizerName, tokenizers) => tokenizers
		.NGram(tokenizerName, nGram => nGram
			.MinGram(1)
			.MaxGram(2)
			.TokenChars(TokenChar.Custom)
			.CustomTokenChars("+-_")
		);

	// Object-initializer form, equivalent to the fluent definition above.
	public override ITokenizer Initializer => new NGramTokenizer
	{
		MinGram = 1,
		MaxGram = 2,
		TokenChars = new[] { TokenChar.Custom },
		CustomTokenChars = "+-_"
	};

	// Expected JSON sent to Elasticsearch for either form.
	public override object Json => new
	{
		min_gram = 1,
		max_gram = 2,
		token_chars = new[] { "custom" },
		custom_token_chars = "+-_",
		type = "ngram"
	};

	public override string Name => "ngram_custom";
}

public class PathHierarchyTests : TokenizerAssertionBase<PathHierarchyTests>
{
public override FuncTokenizer Fluent => (n, t) => t.PathHierarchy(n, e => e
Expand Down

0 comments on commit b14cfdc

Please sign in to comment.