-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore: squash plugin-nlu's branch commits
- Loading branch information
1 parent
c65fd7b
commit 80b592b
Showing
46 changed files
with
1,018 additions
and
1,877 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import { Command, flags } from '@oclif/command' | ||
import { track, getGlobalNodeModulesPath } from '../utils' | ||
import * as colors from 'colors' | ||
const path = require('path') | ||
|
||
export default class Run extends Command { | ||
static description = 'Serve your bot in your localhost' | ||
|
||
static examples = [ | ||
`$ botonic train | ||
TRAINING MODEL FOR {LANGUAGE}... | ||
` | ||
] | ||
|
||
static flags = { | ||
lang: flags.string() | ||
} | ||
|
||
static args = [] | ||
|
||
async run() { | ||
const { args, flags } = this.parse(Run) | ||
|
||
const botonicNLUPath: string = path.join( | ||
process.cwd(), | ||
'node_modules', | ||
'@botonic', | ||
'nlu' | ||
) | ||
try { | ||
const { BotonicNLU, CONSTANTS } = await import(botonicNLUPath) | ||
process.argv.push(CONSTANTS.LANG_FLAG) | ||
if (flags.lang) { | ||
process.argv.push(flags.lang) | ||
} | ||
track('Trained with Botonic train') | ||
const botonicNLU = new BotonicNLU() | ||
const nluPath = path.join(process.cwd(), 'src', CONSTANTS.NLU_DIRNAME) | ||
await botonicNLU.train({ nluPath }) | ||
} catch (e) { | ||
console.log( | ||
`You don't have @botonic/nlu installed.\nPlease, install it by typing the following command:` | ||
.red | ||
) | ||
console.log(` $ npm install @botonic/nlu`) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
{ | ||
"presets": [ | ||
[ | ||
"@babel/preset-env", | ||
{ | ||
"modules": "umd" | ||
} | ||
] | ||
], | ||
"plugins": ["@babel/plugin-transform-runtime"] | ||
} |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
{ | ||
"name": "@botonic/nlu", | ||
"version": "0.1.0", | ||
"main": "lib/index", | ||
"scripts": { | ||
"build": "rm -rf lib && babel src -d lib", | ||
"test": "jest" | ||
}, | ||
"jest": { | ||
"testEnvironment": "node" | ||
}, | ||
"dependencies": { | ||
"@babel/runtime": "^7.5.5", | ||
"@tensorflow/tfjs": "^1.2.7", | ||
"@tensorflow/tfjs-node": "^1.2.7", | ||
"axios": "^0.19.0", | ||
"colors": "^1.3.3", | ||
"compromise": "^11.13.2", | ||
"compromise-plugin": "0.0.9", | ||
    "franc": "^4.1.0", | ||
"inquirer": "^6.3.1", | ||
"sqlite-async": "^1.0.11" | ||
}, | ||
"devDependencies": { | ||
"@babel/cli": "^7.5.5", | ||
"@babel/core": "^7.5.5", | ||
"@babel/plugin-transform-runtime": "^7.5.5", | ||
"@babel/preset-env": "^7.5.5", | ||
"@types/jest": "^24.0.17", | ||
"jest": "^24.8.0" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
import path from 'path' | ||
import { readJSON, readDir } from './file-utils' | ||
import { detectLang, preprocessData } from './preprocessing' | ||
import { getEmbeddingMatrix } from './word-embeddings' | ||
import * as tf from '@tensorflow/tfjs-node' | ||
import { parseLangFlag, printPrettyConfig } from './utils' | ||
import { | ||
UTTERANCES_DIRNAME, | ||
MODELS_DIRNAME, | ||
NLU_DATA_FILENAME, | ||
MODEL_FILENAME | ||
} from './constants' | ||
import { loadDevData, saveDevData } from './file-utils' | ||
import { getPrediction, getIntent } from './prediction' | ||
import { getEntities } from './ner' | ||
|
||
// TODO: interactive command to try intents from terminal | ||
// import inquirer from 'inquirer' | ||
// import { interactiveMode } from './scripts/interactive-mode' | ||
// async function askForInteractiveMode() { | ||
// const questions = [ | ||
// { | ||
// type: 'confirm', | ||
// name: 'affirmative', | ||
// message: `Do you want to switch into interactive mode?` | ||
// } | ||
// ] | ||
// return inquirer.prompt(questions) | ||
// } | ||
|
||
/**
 * Trains, loads and queries per-language intent-classification models.
 * One LSTM model is trained per language found in the project's NLU data.
 */
export class BotonicNLU {
  constructor() {
    // Languages may be pre-selected via the --lang CLI flag; train()
    // later overwrites this with the languages actually found on disk.
    this.languages = parseLangFlag(process.argv)
    this.nluPath = ''
    this.utterancesPath = ''
    this.modelsPath = ''
    // devData: per-language training data loaded by loadDevData().
    this.devData = {}
    // models: per-language tf.LayersModel instances built during train().
    this.models = {}
  }

  /**
   * Trains one model per language and persists each model plus its
   * metadata (vocabulary, intents dict, entities) under modelsPath.
   * @param nluPath  root of the project's NLU directory (contains the
   *                 utterances and models subdirectories).
   */
  async train({ nluPath }) {
    // TODO: Think about passing an arg for using models in memory
    this.nluPath = nluPath
    this.utterancesPath = path.join(nluPath, UTTERANCES_DIRNAME)
    this.modelsPath = path.join(nluPath, MODELS_DIRNAME)
    this.devData = loadDevData(this.nluPath, this.languages)
    // Replace the CLI-provided language list with the languages that
    // actually have data on disk.
    this.languages = Object.keys(this.devData)
    for (let language of this.languages) {
      let devData = this.devData[language]
      let { devIntents, params, devEntities } = devData
      params = { ...params, language } // TODO: Think better this reassignment
      printPrettyConfig(params)
      let start = new Date()
      // Tokenize/pad utterances into tensors and build the vocabulary.
      let {
        tensorData,
        tensorLabels,
        vocabulary,
        vocabularyLength
      } = preprocessData(devIntents, params)
      // Pretrained word-embedding rows aligned with the vocabulary
      // (downloaded/cached by the word-embeddings module).
      let embeddingMatrix = await getEmbeddingMatrix({
        vocabulary,
        vocabularyLength,
        params
      })
      // One output unit per intent class.
      this.models[language] = embeddingLSTMModel({
        params,
        vocabularyLength,
        embeddingMatrix: tf.tensor(embeddingMatrix),
        outputDim: Object.keys(devIntents.intentsDict).length
      })
      this.models[language].summary()
      this.models[language].compile({
        optimizer: tf.train.adam(params.LEARNING_RATE),
        loss: 'categoricalCrossentropy',
        metrics: ['accuracy']
      })
      console.log('TRAINING...')

      const history = await this.models[language].fit(
        tensorData,
        tensorLabels,
        {
          epochs: params.EPOCHS,
          validationSplit: params.VALIDATION_SPLIT
        }
      )
      // Date subtraction yields elapsed milliseconds.
      let end = new Date() - start
      console.log(`\nTOTAL TRAINING TIME: ${end}ms`)
      // Metadata needed at prediction time, saved next to the model.
      let nluData = {
        maxSeqLength: params.MAX_SEQ_LENGTH,
        vocabulary,
        intentsDict: devIntents.intentsDict,
        language,
        devEntities
      }
      await saveDevData({
        modelsPath: this.modelsPath,
        model: this.models[language],
        language,
        nluData
      })
    }
  }

  /**
   * Loads every previously saved model and its metadata from modelsPath.
   * Each subdirectory of modelsPath is treated as a language code.
   * @returns an object keyed by language with { nluData, model }, plus a
   *          `languages` array listing the loaded language codes.
   */
  async loadModels({ modelsPath }) {
    let models = {}
    models.languages = readDir(modelsPath)
    for (let language of models.languages) {
      models[language] = {}
      models[language].nluData = readJSON(
        path.join(modelsPath, language, NLU_DATA_FILENAME)
      )
      models[language].model = await tf.loadLayersModel(
        `file://${modelsPath}/${language}/${MODEL_FILENAME}`
      )
    }
    return models
  }
  /**
   * Classifies a user input: detects its language, runs the matching
   * model, and extracts entities.
   * @param models  the structure returned by loadModels()
   * @param input   raw user utterance
   * @returns { intent, entities }
   */
  predict(models, input) {
    let language = detectLang(input, models.languages)
    let { model, nluData } = models[language]
    let prediction = getPrediction(input, model, nluData)
    let intent = getIntent(prediction, nluData.intentsDict, language)
    let entities = getEntities(input, nluData.devEntities)
    return { intent, entities }
  }
  // TODO(review): disabled interactive terminal mode, kept for reference.
  // static async interactive({ modelsPath, languages }) {
  //   let wantsInteractiveMode = await askForInteractiveMode()
  //   if (wantsInteractiveMode.affirmative) {
  //     let modelsLanguages =
  //       parseLangFlag(process.argv) || languages || readDir(modelsPath)
  //     let nlus = {}
  //     for (let lang of modelsLanguages) {
  //       nlus[`${lang}`] = {}
  //       nlus[`${lang}`].nluData = readJSON(
  //         path.join(modelsPath, lang, NLU_DATA_FILENAME)
  //       )
  //       nlus[`${lang}`].model = await tf.loadLayersModel(
  //         `file://${modelsPath}/${lang}/${MODEL_FILENAME}`
  //       )
  //     }
  //     interactiveMode(nlus)
  //   }
  // }
}
/**
 * Builds the intent-classification network: a pretrained-embedding layer,
 * an LSTM, and a softmax dense output — assembled as a tf sequential model.
 * @param vocabularyLength  size of the token vocabulary (embedding inputDim)
 * @param embeddingMatrix   tf tensor of pretrained embedding weights
 * @param params            training hyperparameters (dims, dropout, units)
 * @param outputDim         number of intent classes (softmax units)
 * @returns the uncompiled tf.sequential model
 */
function embeddingLSTMModel({
  vocabularyLength,
  embeddingMatrix,
  params,
  outputDim
}) {
  const embedding = tf.layers.embedding({
    inputDim: vocabularyLength,
    outputDim: params.EMBEDDING_DIM,
    inputLength: params.MAX_SEQ_LENGTH,
    trainable: params.TRAINABLE_EMBEDDINGS,
    weights: [embeddingMatrix]
  })
  // A bidirectional wrapper around the LSTM was considered and left
  // disabled in the original implementation.
  const recurrent = tf.layers.lstm({
    units: params.UNITS,
    dropout: params.DROPOUT_REG,
    recurrentDropout: params.DROPOUT_REG
  })
  const classifier = tf.layers.dense({
    units: outputDim,
    activation: 'softmax'
  })

  const net = tf.sequential()
  for (const layer of [embedding, recurrent, classifier]) {
    net.add(layer)
  }
  return net
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
// Execution
// CLI flag used to select training/prediction languages (parsed from argv).
export const LANG_FLAG = '--lang'
// Filenames
// Per-language model metadata (vocabulary, intents dict, entities).
export const NLU_DATA_FILENAME = 'nlu-data.json'
// TensorFlow.js layers-model topology file.
export const MODEL_FILENAME = 'model.json'
// Dirnames
export const MODELS_DIRNAME = 'models'
export const UTTERANCES_DIRNAME = 'utterances'
// Subpaths
export const NLU_DIRNAME = 'nlu'
export const NLU_CONFIG_FILENAME = 'nlu.config.json'
export const GLOBAL_CONFIG_DIRNAME = '.botonic'
export const WORD_EMBEDDINGS_DIRNAME = 'word-embeddings'

// General Config
export const UTTERANCES_EXTENSION = '.txt'
export const ASSETS_DIRNAME = 'assets'
// Placeholder token for out-of-vocabulary words.
export const UNKNOWN_TOKEN = '<UNK>'
// SQLite table/column holding the word-embedding vectors.
export const DB = {
  TABLE: 'embeddings',
  COLUMN: 'token'
}
// NOTE(review): "EMBEDDDINGS" has a triple D — renaming would break
// importers, so the typo is kept; consider a coordinated rename.
export const WORD_EMBEDDDINGS_ENDPOINT =
  'https://s3-eu-west-1.amazonaws.com/word-embeddings.hubtype.com'

//Entities
// Markdown-style entity annotation in utterances: [text](EntityName).
export const ENTITIES_REGEX = /\[(.*?)\]\((.*?)\)/
export const GLOBAL_ENTITIES_REGEX = /\[(.*?)\]\((.*?)\)/g
// Entity categories recognized out of the box (compromise NER tags).
export const DEFAULT_ENTITIES = [
  // Nouns
  'Organization',
  'Currency',
  'Unit',
  // Places
  'Country',
  'Region',
  'Place',
  'City',
  // Dates
  'WeekDay',
  'Date',
  'Holiday',
  'Month',
  'Duration',
  'Time',
  // People
  'FirstName',
  'LastName',
  'MaleName',
  'FemaleName',
  'Honorific',
  'Person'
]
Oops, something went wrong.