feat: adds support for Japanese language (#653)
micheleriva authored Mar 3, 2024
1 parent 33c5dde commit ef6106a
Showing 23 changed files with 1,186 additions and 48 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -40,7 +40,7 @@ the
- [Typo Tolerance](https://docs.oramasearch.com/open-source/usage/search/introduction#typo-tolerance)
- [Exact Match](https://docs.oramasearch.com/open-source/usage/search/introduction#exact-match)
- [BM25](https://docs.oramasearch.com/open-source/usage/search/bm25-algorithm)
- [Stemming and tokenization in 29 languages](https://docs.oramasearch.com/open-source/text-analysis/stemming)
- [Stemming and tokenization in 30 languages](https://docs.oramasearch.com/open-source/text-analysis/stemming)
- [Plugin System](https://docs.oramasearch.com/open-source/plugins/introduction)

# Installation
4 changes: 4 additions & 0 deletions packages/docs/.vitepress/config.mts
@@ -95,6 +95,10 @@ export default defineConfig({
{
text: 'Using Chinese with Orama',
link: '/open-source/supported-languages/using-chinese-with-orama.html'
},
{
text: 'Using Japanese with Orama',
link: '/open-source/supported-languages/using-japanese-with-orama.html'
}
]
},
2 changes: 1 addition & 1 deletion packages/docs/cloud/understanding-orama/introduction.md
@@ -15,7 +15,7 @@ Its numerous features include:
- Full-text search
- Vector search
- Hybrid search
- Stemming and tokenizing in 29 languages
- Stemming and tokenizing in 30 languages
- Geosearch
- Query filters
- Facets
13 changes: 8 additions & 5 deletions packages/docs/open-source/supported-languages/index.md
@@ -4,13 +4,15 @@ outline: deep

# Officially Supported Languages

Right now, Orama supports 29 languages out of the box in 8 different alphabets. \
Right now, Orama supports 30 languages out of the box in 8 different alphabets. \
For every language, Orama provides a default tokenizer, stop-words, and stemmer.

::: tip 🇨🇳 A note on Chinese
At the time of writing, Chinese (Mandarin) is the only exception, since Orama provides everything by default but the stemmer.
::: tip 🇨🇳🇯🇵 A note on Chinese and Japanese
At the time of writing, Chinese (Mandarin) and Japanese are the only exceptions: Orama provides everything by default except the stemmer.

Since Chinese logograms follow different rules than other alphabets, you will need to import a dedicated tokenizer for it. [Read more here](/open-source/supported-languages/using-chinese-with-orama.html).
Since Chinese and Japanese logograms follow different rules than other alphabets, you will need to import a dedicated tokenizer for them.

Read more about Chinese [here](/open-source/supported-languages/using-chinese-with-orama.html) and about Japanese [here](/open-source/supported-languages/using-japanese-with-orama.html).
:::

### Latin Alphabet
@@ -86,4 +88,5 @@ Since Chinese logograms follow different rules than other alphabets, you will ne

| Language | Tokenizer | Stop-words | Stemmer |
| ---------------------- | --------- | ---------- | --------|
| Chinese (Mandarin)     | ✅        | ✅         | ❌      |
| Japanese               | ✅        | ✅         | ❌      |
81 changes: 81 additions & 0 deletions packages/docs/open-source/supported-languages/using-japanese-with-orama.md
@@ -0,0 +1,81 @@
# Using Japanese with Orama

At the time of writing, Orama supports Japanese via a custom tokenizer, which is part of the `@orama/tokenizers` package.

:::warning
The Japanese tokenizer is a WASM build of the [lindera](https://github.com/lindera-morphology/lindera) Rust project.
It can be quite large, and using it in the browser is discouraged.
:::

To get started, make sure to install all the dependencies you need:

```sh
npm i @orama/orama @orama/tokenizers
```

If you want to add Japanese stop-words as well, install the `@orama/stopwords` package too:

```sh
npm i @orama/stopwords
```

Now you're ready to get started with Orama:

```js
import { create, insert, search } from '@orama/orama'
import { createTokenizer } from '@orama/tokenizers/japanese'
import { stopwords as japaneseStopwords } from '@orama/stopwords/japanese'

const db = await create({
  schema: {
    name: 'string'
  },
  components: {
    tokenizer: await createTokenizer({
      stopWords: japaneseStopwords
    })
  }
})

await insert(db, { name: '東京' }) // Tokyo
await insert(db, { name: '大阪' }) // Osaka
await insert(db, { name: '京都' }) // Kyoto
await insert(db, { name: '横浜' }) // Yokohama
await insert(db, { name: '札幌' }) // Sapporo
await insert(db, { name: '仙台' }) // Sendai
await insert(db, { name: '広島' }) // Hiroshima
await insert(db, { name: '東京大学' }) // University of Tokyo
await insert(db, { name: '京都大学' }) // Kyoto University
await insert(db, { name: '大阪大学' }) // Osaka University

const results = await search(db, {
  term: '大阪',
  threshold: 0
})

console.log(results)

// {
//   "elapsed": {
//     "raw": 89554625,
//     "formatted": "89ms"
//   },
//   "hits": [
//     {
//       "id": "36666208-3",
//       "score": 4.210224897276653,
//       "document": {
//         "name": "大阪"
//       }
//     },
//     {
//       "id": "36666208-10",
//       "score": 1.9335268122510698,
//       "document": {
//         "name": "大阪大学"
//       }
//     }
//   ],
//   "count": 2
// }
```
2 changes: 1 addition & 1 deletion packages/docs/open-source/text-analysis/stemming.md
@@ -38,7 +38,7 @@ const db = create({
})
```

Right now, Orama supports 29 languages and stemmers out of the box:
Right now, Orama supports 30 languages and stemmers out of the box:

- Arabic
- Armenian
2 changes: 1 addition & 1 deletion packages/docs/open-source/text-analysis/stopwords.md
@@ -86,7 +86,7 @@

## Supported languages

As for now, Orama supports 29 languages when it comes to stop-words removal:
As for now, Orama supports 30 languages when it comes to stop-words removal:

- Arabic
- Armenian
2 changes: 1 addition & 1 deletion packages/orama/README.md
@@ -24,7 +24,7 @@ the
- [Typo Tolerance](https://docs.oramasearch.com/open-source/usage/search/introduction#typo-tolerance)
- [Exact Match](https://docs.oramasearch.com/open-source/usage/search/introduction#exact-match)
- [BM25](https://docs.oramasearch.com/open-source/usage/search/bm25-algorithm)
- [Stemming and tokenization in 29 languages](https://docs.oramasearch.com/open-source/text-analysis/stemming)
- [Stemming and tokenization in 30 languages](https://docs.oramasearch.com/open-source/text-analysis/stemming)
- [Plugin System](https://docs.oramasearch.com/open-source/plugins/introduction)

# Installation
2 changes: 1 addition & 1 deletion packages/stemmers/README.md
@@ -2,7 +2,7 @@

Orama can analyze the input and perform a `stemming` operation, which allows the engine to perform more optimized queries, as well as save indexing space.

Right now, Orama supports 29 languages and stemmers out of the box:
Right now, Orama supports 30 languages and stemmers out of the box:

- Arabic
- Armenian
2 changes: 1 addition & 1 deletion packages/stopwords/README.md
@@ -1,6 +1,6 @@
# Orama Stop-words

This package provides support for stop-words removal in 29 languages:
This package provides support for stop-words removal in 30 languages:

- Arabic
- Armenian
1 change: 1 addition & 0 deletions packages/stopwords/lib/ja.js

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions packages/stopwords/package.json
@@ -82,6 +82,11 @@
"import": "./dist/it.js",
"require": "./dist/it.cjs"
},
"./japanese": {
"types": "./dist/ja.d.ts",
"import": "./dist/ja.js",
"require": "./dist/ja.cjs"
},
"./lithuanian": {
"types": "./dist/lt.d.ts",
"import": "./dist/lt.js",
1 change: 1 addition & 0 deletions packages/stopwords/scripts/build.js
@@ -22,6 +22,7 @@ const stemmers = {
indonesian: 'id',
irish: 'ie',
italian: 'it',
japanese: 'ja',
lithuanian: 'lt',
mandarin: 'zh',
nepali: 'np',
4 changes: 2 additions & 2 deletions packages/tokenizers/README.md
@@ -3,8 +3,8 @@
This package provides support for additional tokenizers for the Orama Search Engine.

Available tokenizers:
- Chinese (Mandarin - stemmer not supported)
- Japanese (work in progress)
- Chinese (Mandarin, experimental)
- Japanese (experimental)
- Korean (work in progress)

Usage:
7 changes: 6 additions & 1 deletion packages/tokenizers/package.json
@@ -5,6 +5,11 @@
"description": "Additional tokenizers for Orama",
"sideEffects": false,
"exports": {
"./japanese": {
"types": "./build/tokenizer-japanese/tokenizer.d.ts",
"import": "./build/tokenizer-japanese/tokenizer.mjs",
"require": "./build/tokenizer-japanese/tokenizer.js"
},
"./mandarin": {
"types": "./build/tokenizer-mandarin/tokenizer.d.ts",
"import": "./build/tokenizer-mandarin/tokenizer.mjs",
@@ -26,7 +31,7 @@
},
"scripts": {
"build": "node ./scripts/build.mjs",
"test": "npm run build && node ./tests/mandarin.test.js"
"test": "npm run build && node ./tests/mandarin.test.js && node ./tests/japanese.test.js"
},
"keywords": [
"full-text search",
66 changes: 36 additions & 30 deletions packages/tokenizers/scripts/build.mjs
@@ -3,47 +3,53 @@ import fs from 'node:fs'
import childProcess from 'node:child_process'

const isWasmPackInstalled = await checkWasmPackInstalled()

if (!isWasmPackInstalled) {
console.warn('!! WARNING')
console.warn('!! Compilation of the Mandarin tokenizer requires wasm-pack to be installed.')
console.warn('!! No wasm-pack installation found. Skipping build.')
process.exit(0)
}

const outdirBaseURL = new URL('../build', import.meta.url).pathname
const tokenizersBaseURL = new URL('../src', import.meta.url).pathname
const languages = ['mandarin', 'japanese']

const mandarinTokenizerPath = path.join(tokenizersBaseURL, 'tokenizer-mandarin')
const mandarinTokenizerWasmPath = path.join(mandarinTokenizerPath, 'pkg')
const mandarinTokenizerDistPath = path.join(tokenizersBaseURL, '../build/tokenizer-mandarin')
const mandarinTokenizerWrapperPath = path.join(tokenizersBaseURL, 'tokenizer-mandarin/src/tokenizer.ts')
const mandarinTokenizerWrapperDistPath = path.join(mandarinTokenizerDistPath, 'tokenizer.ts')
const outdirBaseURL = new URL('../build', import.meta.url).pathname

if (fs.existsSync(outdirBaseURL)) {
fs.rmdirSync(outdirBaseURL, { recursive: true })
}

fs.mkdirSync(outdirBaseURL)

childProcess.execSync(`cd ${mandarinTokenizerPath} && wasm-pack build --target web`)

fs.cpSync(mandarinTokenizerWrapperPath, mandarinTokenizerWrapperDistPath, {
recursive: true
})
for (const language of languages) {

fs.cpSync(mandarinTokenizerWasmPath, mandarinTokenizerDistPath, {
recursive: true
})
  if (!isWasmPackInstalled) {
    console.warn('!! WARNING')
    console.warn(`!! Compilation of the **${language}** tokenizer requires wasm-pack to be installed.`)
    console.warn('!! No wasm-pack installation found. Skipping build.')
    process.exit(0)
  }

fs.rmSync(path.join(mandarinTokenizerDistPath, '.gitignore'))

const r = fs.readFileSync('./build/tokenizer-mandarin/tokenizer_mandarin_bg.wasm')
const b = new Uint8Array(r)
const rr = `export const wasm = new Uint8Array([${b.join(',')}]);`
fs.writeFileSync('./build/tokenizer-mandarin/tokenizer_mandarin_bg_wasm_arr.js', rr)

childProcess.execSync(`cd ${mandarinTokenizerDistPath} && npx tsup --format cjs,esm,iife --outDir . tokenizer.ts`)
  const tokenizersBaseURL = new URL('../src', import.meta.url).pathname

  const tokenizerPath = path.join(tokenizersBaseURL, `tokenizer-${language}`)
  const tokenizerWasmPath = path.join(tokenizerPath, 'pkg')
  const tokenizerDistPath = path.join(tokenizersBaseURL, `../build/tokenizer-${language}`)
  const tokenizerWrapperPath = path.join(tokenizersBaseURL, `tokenizer-${language}/src/tokenizer.ts`)
  const tokenizerWrapperDistPath = path.join(tokenizerDistPath, 'tokenizer.ts')

  childProcess.execSync(`cd ${tokenizerPath} && wasm-pack build --target web`)

  fs.cpSync(tokenizerWrapperPath, tokenizerWrapperDistPath, {
    recursive: true
  })

  fs.cpSync(tokenizerWasmPath, tokenizerDistPath, {
    recursive: true
  })

  fs.rmSync(path.join(tokenizerDistPath, '.gitignore'))

  const r = fs.readFileSync(`./build/tokenizer-${language}/tokenizer_${language}_bg.wasm`)
  const b = new Uint8Array(r)
  const rr = `export const wasm = new Uint8Array([${b.join(',')}]);`
  fs.writeFileSync(`./build/tokenizer-${language}/tokenizer_${language}_bg_wasm_arr.js`, rr)

  childProcess.execSync(`cd ${tokenizerDistPath} && npx tsup --format cjs,esm,iife --outDir . tokenizer.ts`)
}

async function checkWasmPackInstalled() {
return new Promise((resolve) => {
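The `tokenizer_${language}_bg_wasm_arr.js` step in the build script above inlines the compiled `.wasm` binary into a JavaScript source file as a `Uint8Array` literal, so consumers can compile the module with no filesystem or network access. Here is a minimal self-contained sketch of that round trip; the 8-byte array below is the smallest valid WASM module (magic number plus version), standing in for the real tokenizer binary:

```javascript
// Minimal valid WASM module: magic number "\0asm" followed by version 1.
// This stands in for the real lindera tokenizer binary.
const bytes = new Uint8Array([0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00])

// The kind of JS source the build script writes out: the raw bytes
// serialized into a Uint8Array literal, exported as `wasm`.
const source = `export const wasm = new Uint8Array([${bytes.join(',')}]);`
console.log(source)

// At runtime, consumers can compile the inlined bytes directly,
// without fetching or reading a .wasm file.
const mod = await WebAssembly.compile(bytes)
console.log(WebAssembly.Module.exports(mod).length) // 0 exports for the empty module
```

Inlining trades bundle size for portability, which is also why the docs page added in this commit discourages using the Japanese tokenizer in the browser.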
2 changes: 2 additions & 0 deletions packages/tokenizers/src/tokenizer-japanese/.gitignore
@@ -0,0 +1,2 @@
target
pkg
