Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Encoding mappings for pairs of sequence #506

Merged
merged 6 commits into from
Nov 6, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bindings/node/lib/bindings/__mocks__/vocab.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@ name
is
jo
##hn
what
yours
pair
[UNK]
46 changes: 36 additions & 10 deletions bindings/node/lib/bindings/raw-encoding.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,53 +5,79 @@ import { PaddingDirection } from "./enums";
*/
export interface RawEncoding {
/**
* Get the encoded tokens corresponding to the word at the given index in the input
* sequence, with the form [startToken, endToken+1]
* @param word The position of a word in the input sequence
* Get the encoded tokens corresponding to the word at the given index in one of the input
* sequences, with the form [startToken, endToken+1]
* @param word The position of a word in one of the input sequences
* @param seqId The index of the input sequence that contains said word
* @since 0.7.0
*/
wordToTokens(word: number): [number, number] | undefined;
wordToTokens(word: number, seqId?: number): [number, number] | undefined;

/**
* Get the offsets of the word at the given index in one of the input sequences
* @param word The index of the word in one of the input sequences
* @param seqId The index of the input sequence that contains said word
* @since 0.7.0
*/
wordToChars(word: number): [number, number] | undefined;
wordToChars(word: number, seqId?: number): [number, number] | undefined;

/**
* Get the index of the sequence that contains the given token
* @param token The index of the token in the encoded sequence
*/
tokenToSequence(token: number): number | undefined;

/**
* Get the offsets of the token at the given index
* If this encoding represents only one sequence, then only the offsets are returned.
* If this encoding represents more than one sequence, then it returns a tuple with the sequence
* id in the first part
* @param token The index of the token in the encoded sequence
* @since 0.7.0
*/
tokenToChars(token: number): [number, number] | undefined;
tokenToChars(token: number): [number, number] | [number, [number, number]] | undefined;

/**
* Get the word that contains the token at the given index
* If this encoding represents only one sequence, then only the word index is returned.
* If this encoding represents more than one sequence, then it returns a tuple with the sequence
* id in the first part
* @param token The index of the token in the encoded sequence
* @since 0.7.0
*/
tokenToWord(token: number): number | undefined;
tokenToWord(token: number): number | [number, number] | undefined;

/**
* Find the index of the token at the position of the given char
* @param pos The position of a char in the input string
* @param pos The position of a char in one of the input strings
* @param seqId The index of the input sequence that contains said char
* @since 0.6.0
*/
charToToken(pos: number): number | undefined;
charToToken(pos: number, seqId?: number): number | undefined;

/**
* Get the word that contains the given char
* @param pos The position of a char in one of the input strings
* @param seqId The index of the input sequence that contains said char
* @since 0.7.0
*/
charToWord(pos: number): number | undefined;
charToWord(pos: number, seqId?: number): number | undefined;

/**
* Returns the attention mask
*/
getAttentionMask(): number[];

/**
* Returns the number of sequences
*/
getNSequences(): number;

/**
* Set the sequence id for this encoding
*/
setSequenceId(seqId: number): undefined;

/**
* Returns the tokenized ids
*/
Expand Down
44 changes: 42 additions & 2 deletions bindings/node/lib/bindings/raw-encoding.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,21 +38,23 @@ describe("Can modify pretokenizers on the fly", () => {
tokenizer.setPreTokenizer(sequencePreTokenizer([whitespacePreTokenizer()]));

encoding = await encode(input, null);
expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 6]);
expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 8]);

// Change pre tokenizer
tokenizer.setPreTokenizer(
sequencePreTokenizer([whitespacePreTokenizer(), punctuationPreTokenizer()])
);

encoding = await encode(input, null);
expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 6, 6, 6]);
expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 8, 8, 8]);
});
});

describe("RawEncoding", () => {
const originalString = "my name is john";
const originalPairString = "what is yours?";
let encoding: RawEncoding;
let encodingDual: RawEncoding;
let encode: (
sequence: InputSequence,
pair?: InputSequence | null,
Expand All @@ -74,6 +76,7 @@ describe("RawEncoding", () => {

beforeEach(async () => {
encoding = await encode(originalString, null);
encodingDual = await encode(originalString, originalPairString);
});

it("has a list of defined methods", async () => {
Expand Down Expand Up @@ -115,6 +118,11 @@ describe("RawEncoding", () => {
expect(indexes).toEqual([3, 5]);
});

// Looks up word 3 in each sequence of the encoded pair; the second argument
// to wordToTokens selects which input sequence the word index refers to.
it("returns the correct indexes with pair sequences", () => {
  expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5]);
  expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9]);
});

it("returns undefined when out of range word", () => {
const index = encoding.wordToTokens(100);
expect(index).toBeUndefined();
Expand All @@ -127,18 +135,35 @@ describe("RawEncoding", () => {
expect(offsets).toEqual([11, 15]);
});

it("returns the correct offsets with pair sequences", () => {
expect(encodingDual.wordToChars(3, 0)).toEqual([11, 15]);
expect(encodingDual.wordToChars(3, 1)).toEqual([13, 14]);
});

it("returns undefined when out of range word", () => {
const offsets = encoding.wordToChars(100);
expect(offsets).toBeUndefined();
});
});

describe("tokenToSequence", () => {
it("returns the correct value", () => {
expect(encodingDual.tokenToSequence(4)).toEqual(0);
expect(encodingDual.tokenToSequence(6)).toEqual(1);
});
});

describe("tokenToChars", () => {
it("returns the correct offsets", () => {
const offsets = encoding.tokenToChars(3);
expect(offsets).toEqual([11, 13]);
});

it("returns the correct offsets with pair sequences", () => {
expect(encodingDual.tokenToChars(3)).toEqual([0, [11, 13]]);
expect(encodingDual.tokenToChars(7)).toEqual([1, [8, 13]]);
});

it("returns undefined when out of range token", () => {
const offsets = encoding.tokenToChars(100);
expect(offsets).toBeUndefined();
Expand All @@ -151,6 +176,11 @@ describe("RawEncoding", () => {
expect(index).toEqual(3);
});

it("returns the correct index with pair sequences", () => {
expect(encodingDual.tokenToWord(3)).toEqual([0, 3]);
expect(encodingDual.tokenToWord(7)).toEqual([1, 2]);
});

it("returns undefined when out of range token", () => {
const index = encoding.tokenToWord(100);
expect(index).toBeUndefined();
Expand All @@ -163,6 +193,11 @@ describe("RawEncoding", () => {
expect(index).toEqual(1);
});

it("returns the correct index with pair sequences", () => {
expect(encodingDual.charToToken(3, 0)).toEqual(1);
expect(encodingDual.charToToken(3, 1)).toEqual(5);
});

it("returns undefined when out of range char", () => {
const index = encoding.charToToken(100);
expect(index).toBeUndefined();
Expand All @@ -175,6 +210,11 @@ describe("RawEncoding", () => {
expect(index).toEqual(1);
});

it("returns the correct index with pair sequences", () => {
expect(encodingDual.charToWord(3, 0)).toEqual(1);
expect(encodingDual.charToWord(3, 1)).toEqual(0);
});

it("returns undefined when out of range char", () => {
const index = encoding.charToWord(100);
expect(index).toBeUndefined();
Expand Down
61 changes: 50 additions & 11 deletions bindings/node/lib/implementations/encoding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@ export class Encoding {
return new Encoding(mergedRaw);
}

/**
* Number of sequences
*/
get nSequences(): number {
return this._rawEncoding.getNSequences();
}

/**
* Set the sequence id for this encoding by forwarding it to the underlying raw encoding
* @param seqId The sequence id to assign to this encoding
*/
setSequenceId(seqId: number) {
return this._rawEncoding.setSequenceId(seqId);
}

/**
* Attention mask
*/
Expand Down Expand Up @@ -141,48 +152,76 @@ export class Encoding {
}

/**
* Get the encoded tokens corresponding to the word at the given index in the input
* sequence, with the form [startToken, endToken+1]
* @param word The position of a word in the input sequence
* Get the encoded tokens corresponding to the word at the given index in one of the input
* sequences, with the form [startToken, endToken+1]
* @param word The position of a word in one of the input sequences
* @param seqId The index of the input sequence that contains said word
* @since 0.7.0
*/
wordToTokens(word: number): [number, number] | undefined {
return this._rawEncoding.wordToTokens(word);
wordToTokens(word: number, seqId?: number): [number, number] | undefined {
return this._rawEncoding.wordToTokens(word, seqId);
}

/**
* Get the offsets of the word at the given index in one of the input sequences
* @param word The index of the word in one of the input sequences
* @param seqId The index of the input sequence that contains said word
* @since 0.7.0
*/
wordToChars(word: number): [number, number] | undefined {
return this._rawEncoding.wordToChars(word);
wordToChars(word: number, seqId?: number): [number, number] | undefined {
return this._rawEncoding.wordToChars(word, seqId);
}

/**
* Get the index of the sequence that contains the given token
* @param token The index of the token in the encoded sequence
*/
tokenToSequence(token: number): number | undefined {
return this._rawEncoding.tokenToSequence(token);
}

/**
* Get the offsets of the token at the given index
* If this encoding represents only one sequence, then only the offsets are returned.
* If this encoding represents more than one sequence, then it returns a tuple with the sequence
* id in the first part
* @param token The index of the token in the encoded sequence
* @since 0.7.0
*/
tokenToChars(token: number): [number, number] | undefined {
tokenToChars(token: number): [number, number] | [number, [number, number]] | undefined {
return this._rawEncoding.tokenToChars(token);
}

/**
* Get the word that contains the token at the given index
* If this encoding represents only one sequence, then only the word index is returned.
* If this encoding represents more than one sequence, then it returns a tuple with the sequence
* id in the first part
* @param token The index of the token in the encoded sequence
* @since 0.7.0
*/
tokenToWord(token: number): number | undefined {
tokenToWord(token: number): number | [number, number] | undefined {
return this._rawEncoding.tokenToWord(token);
}

/**
* Find the index of the token at the position of the given char
* @param pos The position of a char in one of the input strings
* @param seqId The index of the input sequence that contains said char
* @since 0.6.0
*/
charToToken(pos: number, seqId?: number): number | undefined {
return this._rawEncoding.charToToken(pos, seqId);
}

/**
* Get the word that contains the given char
* @param pos The position of a char in one of the input strings
* @param seqId The index of the input sequence that contains said char
* @since 0.7.0
*/
charToToken(pos: number): number | undefined {
return this._rawEncoding.charToToken(pos);
charToWord(pos: number, seqId?: number): number | undefined {
return this._rawEncoding.charToWord(pos, seqId);
}

/**
Expand Down
Loading