huggingface · n1t0 · Nov 6, 2020 · Nov 3, 2020 · Nov 4, 2020 · Nov 4, 2020
diff --git a/bindings/node/lib/bindings/__mocks__/vocab.txt b/bindings/node/lib/bindings/__mocks__/vocab.txt
@@ -3,5 +3,7 @@ name
 is
 jo
 ##hn
+what
+yours
 pair
 [UNK]
diff --git a/bindings/node/lib/bindings/raw-encoding.d.ts b/bindings/node/lib/bindings/raw-encoding.d.ts
@@ -5,53 +5,79 @@ import { PaddingDirection } from "./enums";
  */
 export interface RawEncoding {
   /**
-   * Get the encoded tokens corresponding to the word at the given index in the input
-   * sequence, with the form [startToken, endToken+1]
-   * @param word The position of a word in the input sequence
+   * Get the encoded tokens corresponding to the word at the given index in one of the input
+   * sequences, with the form [startToken, endToken+1]
+   * @param word The position of a word in one of the input sequences
+   * @param seqId The index of the input sequence that contains said word
    * @since 0.7.0
    */
-  wordToTokens(word: number): [number, number] | undefined;
+  wordToTokens(word: number, seqId?: number): [number, number] | undefined;
 
   /**
    * Get the offsets of the word at the given index in the input sequence
    * @param word The index of the word in the input sequence
+   * @param seqId The index of the input sequence that contains said word
    * @since 0.7.0
    */
-  wordToChars(word: number): [number, number] | undefined;
+  wordToChars(word: number, seqId?: number): [number, number] | undefined;
+
+  /**
+   * Get the index of the sequence that contains the given token
+   * @param token The index of the token in the encoded sequence
+   */
+  tokenToSequence(token: number): number | undefined;
 
   /**
    * Get the offsets of the token at the given index
+   * If this encoding represents only one sequence, then only the offsets are returned.
+   * If this encoding represents more than one sequence, then it returns a tuple of the form
+   *   [sequenceId, [start, end]]
    * @param token The index of the token in the encoded sequence
    * @since 0.7.0
    */
-  tokenToChars(token: number): [number, number] | undefined;
+  tokenToChars(token: number): [number, number] | [number, [number, number]] | undefined;
 
   /**
    * Get the word that contains the token at the given index
+   * If this encoding represents only one sequence, then only the word index is returned.
+   * If this encoding represents more than one sequence, then it returns a tuple of the form
+   *   [sequenceId, wordIndex]
    * @param token The index of the token  in the encoded sequence
    * @since 0.7.0
    */
-  tokenToWord(token: number): number | undefined;
+  tokenToWord(token: number): number | [number, number] | undefined;
 
   /**
    * Find the index of the token at the position of the given char
-   * @param pos The position of a char in the input string
+   * @param pos The position of a char in one of the input strings
+   * @param seqId The index of the input sequence that contains said char
    * @since 0.6.0
    */
-  charToToken(pos: number): number | undefined;
+  charToToken(pos: number, seqId?: number): number | undefined;
 
   /**
    * Get the word that contains the given char
    * @param pos The position of a char in the input string
+   * @param seqId The index of the input sequence that contains said char
    * @since 0.7.0
    */
-  charToWord(pos: number): number | undefined;
+  charToWord(pos: number, seqId?: number): number | undefined;
 
   /**
    * Returns the attention mask
    */
   getAttentionMask(): number[];
 
+  /**
+   * Returns the number of sequences
+   */
+  getNSequences(): number;
+
+  /**
+   * Set the sequence id for this encoding
+   */
+  setSequenceId(seqId: number): undefined;
+
   /**
    * Returns the tokenized ids
    */

diff --git a/bindings/node/lib/bindings/raw-encoding.test.ts b/bindings/node/lib/bindings/raw-encoding.test.ts
@@ -38,21 +38,23 @@ describe("Can modify pretokenizers on the fly", () => {
     tokenizer.setPreTokenizer(sequencePreTokenizer([whitespacePreTokenizer()]));
 
     encoding = await encode(input, null);
-    expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 6]);
+    expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 8]);
 
     // Change pre tokenizer
     tokenizer.setPreTokenizer(
       sequencePreTokenizer([whitespacePreTokenizer(), punctuationPreTokenizer()])
     );
 
     encoding = await encode(input, null);
-    expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 6, 6, 6]);
+    expect(encoding.getIds()).toEqual([0, 1, 2, 3, 4, 8, 8, 8]);
   });
 });
 
 describe("RawEncoding", () => {
   const originalString = "my name is john";
+  const originalPairString = "what is yours?";
   let encoding: RawEncoding;
+  let encodingDual: RawEncoding;
   let encode: (
     sequence: InputSequence,
     pair?: InputSequence | null,
@@ -74,6 +76,7 @@ describe("RawEncoding", () => {
 
   beforeEach(async () => {
     encoding = await encode(originalString, null);
+    encodingDual = await encode(originalString, originalPairString);
   });
 
   it("has a list of defined methods", async () => {
@@ -115,6 +118,11 @@ describe("RawEncoding", () => {
       expect(indexes).toEqual([3, 5]);
     });
 
+    it("returns the correct indexes with pair sequences", () => {
+      expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5]); // word 3 of the first sequence
+      expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9]); // word 3 of the pair sequence
+    });
+
     it("returns undefined when out of range word", () => {
       const index = encoding.wordToTokens(100);
       expect(index).toBeUndefined();
@@ -127,18 +135,35 @@ describe("RawEncoding", () => {
       expect(offsets).toEqual([11, 15]);
     });
 
+    it("returns the correct offsets with pair sequences", () => {
+      expect(encodingDual.wordToChars(3, 0)).toEqual([11, 15]);
+      expect(encodingDual.wordToChars(3, 1)).toEqual([13, 14]);
+    });
+
     it("returns undefined when out of range word", () => {
       const offsets = encoding.wordToChars(100);
       expect(offsets).toBeUndefined();
     });
   });
 
+  describe("tokenToSequence", () => {
+    it("returns the correct value", () => {
+      expect(encodingDual.tokenToSequence(4)).toEqual(0);
+      expect(encodingDual.tokenToSequence(6)).toEqual(1);
+    });
+  });
+
   describe("tokenToChars", () => {
     it("returns the correct offsets", () => {
       const offsets = encoding.tokenToChars(3);
       expect(offsets).toEqual([11, 13]);
     });
 
+    it("returns the correct offsets with pair sequences", () => {
+      expect(encodingDual.tokenToChars(3)).toEqual([0, [11, 13]]);
+      expect(encodingDual.tokenToChars(7)).toEqual([1, [8, 13]]);
+    });
+
     it("returns undefined when out of range token", () => {
       const offsets = encoding.tokenToChars(100);
       expect(offsets).toBeUndefined();
@@ -151,6 +176,11 @@ describe("RawEncoding", () => {
       expect(index).toEqual(3);
     });
 
+    it("returns the correct index with pair sequences", () => {
+      expect(encodingDual.tokenToWord(3)).toEqual([0, 3]);
+      expect(encodingDual.tokenToWord(7)).toEqual([1, 2]);
+    });
+
     it("returns undefined when out of range token", () => {
       const index = encoding.tokenToWord(100);
       expect(index).toBeUndefined();
@@ -163,6 +193,11 @@ describe("RawEncoding", () => {
       expect(index).toEqual(1);
     });
 
+    it("returns the correct index with pair sequences", () => {
+      expect(encodingDual.charToToken(3, 0)).toEqual(1);
+      expect(encodingDual.charToToken(3, 1)).toEqual(5);
+    });
+
     it("returns undefined when out of range char", () => {
       const index = encoding.charToToken(100);
       expect(index).toBeUndefined();
@@ -175,6 +210,11 @@ describe("RawEncoding", () => {
       expect(index).toEqual(1);
     });
 
+    it("returns the correct index with pair sequences", () => {
+      expect(encodingDual.charToWord(3, 0)).toEqual(1);
+      expect(encodingDual.charToWord(3, 1)).toEqual(0);
+    });
+
     it("returns undefined when out of range char", () => {
       const index = encoding.charToWord(100);
       expect(index).toBeUndefined();

diff --git a/bindings/node/lib/implementations/encoding.ts b/bindings/node/lib/implementations/encoding.ts
@@ -28,6 +28,17 @@ export class Encoding {
     return new Encoding(mergedRaw);
   }
 
+  /**
+   * Number of sequences
+   */
+  get nSequences(): number {
+    return this._rawEncoding.getNSequences();
+  }
+
+  setSequenceId(seqId: number) {
+    return this._rawEncoding.setSequenceId(seqId);
+  }
+
   /**
    * Attention mask
    */
@@ -141,48 +152,76 @@ export class Encoding {
   }
 
   /**
-   * Get the encoded tokens corresponding to the word at the given index in the input
-   * sequence, with the form [startToken, endToken+1]
-   * @param word The position of a word in the input sequence
+   * Get the encoded tokens corresponding to the word at the given index in one of the input
+   * sequences, with the form [startToken, endToken+1]
+   * @param word The position of a word in one of the input sequences
+   * @param seqId The index of the input sequence that contains said word
    * @since 0.7.0
    */
-  wordToTokens(word: number): [number, number] | undefined {
-    return this._rawEncoding.wordToTokens(word);
+  wordToTokens(word: number, seqId?: number): [number, number] | undefined {
+    return this._rawEncoding.wordToTokens(word, seqId);
   }
 
   /**
    * Get the offsets of the word at the given index in the input sequence
    * @param word The index of the word in the input sequence
+   * @param seqId The index of the input sequence that contains said word
    * @since 0.7.0
    */
-  wordToChars(word: number): [number, number] | undefined {
-    return this._rawEncoding.wordToChars(word);
+  wordToChars(word: number, seqId?: number): [number, number] | undefined {
+    return this._rawEncoding.wordToChars(word, seqId);
+  }
+
+  /**
+   * Get the index of the sequence that contains the given token
+   * @param token The index of the token in the encoded sequence
+   */
+  tokenToSequence(token: number): number | undefined {
+    return this._rawEncoding.tokenToSequence(token);
   }
 
   /**
    * Get the offsets of the token at the given index
+   * If this encoding represents only one sequence, then only the offsets are returned.
+   * If this encoding represents more than one sequence, then it returns a tuple of the form
+   *   [sequenceId, [start, end]]
    * @param token The index of the token in the encoded sequence
    * @since 0.7.0
    */
-  tokenToChars(token: number): [number, number] | undefined {
+  tokenToChars(token: number): [number, number] | [number, [number, number]] | undefined {
     return this._rawEncoding.tokenToChars(token);
   }
 
   /**
    * Get the word that contains the token at the given index
+   * If this encoding represents only one sequence, then only the word index is returned.
+   * If this encoding represents more than one sequence, then it returns a tuple of the form
+   *   [sequenceId, wordIndex]
    * @param token The index of the token  in the encoded sequence
    * @since 0.7.0
    */
-  tokenToWord(token: number): number | undefined {
+  tokenToWord(token: number): number | [number, number] | undefined {
     return this._rawEncoding.tokenToWord(token);
   }
 
   /**
    * Find the index of the token at the position of the given char
+   * @param pos The position of a char in one of the input strings
+   * @param seqId The index of the input sequence that contains said char
+   * @since 0.6.0
+   */
+  charToToken(pos: number, seqId?: number): number | undefined {
+    return this._rawEncoding.charToToken(pos, seqId);
+  }
+
+  /**
+   * Get the word that contains the given char
    * @param pos The position of a char in the input string
+   * @param seqId The index of the input sequence that contains said char
+   * @since 0.7.0
    */
-  charToToken(pos: number): number | undefined {
-    return this._rawEncoding.charToToken(pos);
+  charToWord(pos: number, seqId?: number): number | undefined {
+    return this._rawEncoding.charToWord(pos, seqId);
   }
 
   /**