Skip to content

Commit

Permalink
[inference] NL-to-ESQL: improve doc generation (elastic#192378)
Browse files Browse the repository at this point in the history
## Summary

Follow-up of elastic#190433

Fix [elastic#192762](elastic#192762)

- Clean up and refactor the documentation generation script
- Make some tweaks to the documentation to improve efficiency and make
better use of tokens
- Perform human review of the generated content to make sure everything
is accurate

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
  • Loading branch information
pgayvallet and kibanamachine authored Sep 13, 2024
1 parent f2f5096 commit 3226eb6
Show file tree
Hide file tree
Showing 133 changed files with 3,560 additions and 1,493 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ const buildTestDefinitions = (): Section[] => {
{
title: 'Generates a query to show employees filtered by name and grouped by hire_date',
question: `From the employees index, I want to see how many employees with a "B" in their first name
where hired each month over the past 2 years.
were hired each month over the past 2 years.
Assume the following fields:
- hire_date
- first_name
Expand All @@ -208,10 +208,10 @@ const buildTestDefinitions = (): Section[] => {
(which can be read the same backward and forward), and then return their last name and first name
- last_name
- first_name`,
expected: `FROM employees
| EVAL reversed_last_name = REVERSE(last_name)
| WHERE TO_LOWER(last_name) == TO_LOWER(reversed_last_name)
| KEEP last_name, first_name`,
criteria: [
`The assistant should not provide an ES|QL query, and explicitly mention that there is no
way to check for palindromes using ES|QL.`,
],
},
{
title: 'Generates a query to show the top 10 domains by doc count',
Expand Down
288 changes: 288 additions & 0 deletions x-pack/plugins/inference/scripts/load_esql_docs/extract_doc_entries.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import Fs from 'fs/promises';
import Path from 'path';
import fastGlob from 'fast-glob';
import $, { load, Cheerio, AnyNode } from 'cheerio';
import { partition } from 'lodash';
import { ToolingLog } from '@kbn/tooling-log';
import pLimit from 'p-limit';
import { ScriptInferenceClient } from '../util/kibana_client';
import { convertToMarkdownPrompt } from './prompts/convert_to_markdown';
import { bindOutput, PromptCaller } from './utils/output_executor';

/**
 * File names of the ES|QL reference pages that are extracted as plain pages
 * (see `processFile`) rather than being split into per-command or per-function
 * entries. They are only meant to serve as context for the LLM during the
 * documentation enhancement tasks.
 */
const contextArticles: string[] = [
  'esql.html',
  'esql-syntax.html',
  'esql-kibana.html',
  'esql-query-api.html',
  'esql-limitations.html',
  'esql-cross-clusters.html',
  'esql-examples.html',
  'esql-metadata-fields.html',
  'esql-multi-index.html',
];

/**
 * A documentation page stored as a whole in `ExtractionOutput.pages`,
 * as opposed to the per-command / per-function entries.
 */
interface ExtractedPage {
/** File name (basename) of the HTML page this entry was built from. */
sourceFile: string;
/**
 * Short identifier of the page: `overview` for `esql.html`, `operators` for the
 * operators page, otherwise the file name without its `esql-` prefix and `.html` suffix.
 */
name: string;
/**
 * Extracted content: simplified text for context articles, or the concatenated
 * section titles and section HTML for the operators page.
 */
content: string;
}

/**
 * A single ES|QL command or function extracted from the reference documentation.
 */
export interface ExtractedCommandOrFunc {
/** Title of the documentation section the entry was extracted from. */
name: string;
/** Result of running the convert-to-markdown prompt on the section's HTML. */
markdownContent: string;
/** `true` for entries coming from the commands page, `false` for functions. */
command: boolean;
}

/**
 * Aggregated result of {@link extractDocEntries}. The same object is passed to
 * every file processor, each of which appends its own entries to it.
 */
export interface ExtractionOutput {
/** Entries extracted from `esql-commands.html`. */
commands: ExtractedCommandOrFunc[];
/** Function entries extracted from `esql-functions-operators.html`. */
functions: ExtractedCommandOrFunc[];
/** Context articles, plus the `operators` page built from the operator sections. */
pages: ExtractedPage[];
/** Basenames of the matched files that were not handled by any processor. */
skippedFile: string[];
}

/**
 * Extracts the ES|QL documentation entries from a local build of the docs.
 *
 * Every `esql*.html` page found under the `master` Elasticsearch reference of
 * `builtDocsDir` is handed to `processFile`, and all processors append their
 * results to one shared {@link ExtractionOutput}.
 *
 * @param builtDocsDir root folder of the built documentation
 * @param log logger forwarded to the file processors
 * @param inferenceClient client providing the `output` API and connector id used to run prompts
 * @returns the output object once every file has been processed
 * @throws Error if no file matches the ES|QL page pattern
 */
export async function extractDocEntries({
  builtDocsDir,
  log,
  inferenceClient,
}: {
  builtDocsDir: string;
  log: ToolingLog;
  inferenceClient: ScriptInferenceClient;
}): Promise<ExtractionOutput> {
  const esqlPagesPattern = `${builtDocsDir}/html/en/elasticsearch/reference/master/esql*.html`;
  const files = await fastGlob(esqlPagesPattern);
  if (files.length === 0) {
    throw new Error('No files found');
  }

  // Single accumulator mutated by every file processor.
  const output: ExtractionOutput = { commands: [], functions: [], pages: [], skippedFile: [] };

  const executePrompt = bindOutput({
    output: inferenceClient.output,
    connectorId: inferenceClient.getConnectorId(),
  });

  // One limiter shared by all files, handed down to the processors that run prompts.
  const limiter = pLimit(10);

  const pendingFiles = files.map((file) =>
    processFile({ file, log, executePrompt, output, limiter })
  );
  await Promise.all(pendingFiles);

  return output;
}

/**
 * Reads one HTML page and routes it according to its file name:
 * - `esql-commands.html` goes to `processCommands`
 * - `esql-functions-operators.html` goes to `processFunctionsAndOperators`
 * - files listed in `contextArticles` are stored as simplified text in `output.pages`
 * - anything else is recorded in `output.skippedFile`
 */
async function processFile({
  file: fileFullPath,
  output,
  executePrompt,
  log,
  limiter,
}: {
  file: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  const basename = Path.basename(fileFullPath);
  const rawContent = await Fs.readFile(fileFullPath);
  const fileContent = rawContent.toString('utf-8');

  switch (basename) {
    case 'esql-commands.html':
      await processCommands({ fileContent, log, output, limiter, executePrompt });
      return;
    case 'esql-functions-operators.html':
      await processFunctionsAndOperators({ fileContent, log, output, limiter, executePrompt });
      return;
  }

  if (!contextArticles.includes(basename)) {
    output.skippedFile.push(basename);
    return;
  }

  const $element = load(fileContent)('*');
  // `esql.html` is the overview; other pages drop the 5-char `esql-` prefix and `.html` suffix.
  const pageName =
    basename === 'esql.html' ? 'overview' : basename.substring(5, basename.length - 5);
  output.pages.push({
    sourceFile: basename,
    name: pageName,
    content: getSimpleText($element),
  });
}

/**
 * Processes the functions / operators page.
 *
 * Sections are split in two groups:
 * - operator sections (exact title `IN` / `LIKE` / `RLIKE`, or a title starting,
 *   case-insensitively, with one of the known operator headings) are merged into
 *   a single `operators` page appended to `output.pages`;
 * - among the remaining sections, those whose title is made only of uppercase
 *   letters and underscores are treated as functions: each one is converted to
 *   markdown through `executePrompt` (scheduled via `limiter`) and appended to
 *   `output.functions`.
 */
async function processFunctionsAndOperators({
  fileContent,
  output,
  executePrompt,
  log,
  limiter,
}: {
  fileContent: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  const $element = load(fileContent)('*');
  const sections = extractSections($element);

  // Operator headings matched by prefix; lowercased once for the case-insensitive comparison.
  const operatorTitlePrefixes = [
    'Binary operators',
    'Equality',
    'Inequality',
    'Less than',
    'Less than or equal to',
    'Greater than',
    'Greater than or equal to',
    'Add +',
    'Subtract -',
    'Multiply *',
    'Divide /',
    'Modulus %',
    'Unary operators',
    'Logical operators',
    'IS NULL and IS NOT NULL',
    'Cast (::)',
  ].map((prefix) => prefix.toLowerCase());

  // Operator headings that must match the title exactly.
  const exactOperatorTitles = ['IN', 'LIKE', 'RLIKE'];

  const isOperatorSection = ({ title }: { title: string }) => {
    if (exactOperatorTitles.includes(title)) {
      return true;
    }
    const lowerTitle = title.toLowerCase();
    return operatorTitlePrefixes.some((prefix) => lowerTitle.startsWith(prefix));
  };

  const [operatorSections, allOtherSections] = partition(sections, isOperatorSection);

  const functionSections = allOtherSections.filter(({ title }) => /^[A-Z_]+$/.test(title));

  const markdownFiles = await Promise.all(
    functionSections.map((section) =>
      limiter(async () => {
        const markdownContent = await executePrompt(
          convertToMarkdownPrompt({ htmlContent: section.content })
        );
        return { name: section.title, markdownContent, command: false };
      })
    )
  );
  output.functions.push(...markdownFiles);

  const operatorsContent = operatorSections
    .map(({ title, content }) => `${title}\n${content}`)
    .join('\n');
  output.pages.push({
    sourceFile: 'esql-functions-operators.html',
    name: 'operators',
    content: operatorsContent,
  });
}

/**
 * Processes the commands page: every section whose title is made only of
 * uppercase letters and underscores is converted to markdown through
 * `executePrompt` (scheduled via `limiter`) and appended to `output.commands`.
 */
async function processCommands({
  fileContent,
  output,
  executePrompt,
  log,
  limiter,
}: {
  fileContent: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  const $element = load(fileContent)('*');
  const commandSections = extractSections($element).filter(({ title }) =>
    /^[A-Z_]+$/.test(title)
  );

  const conversions = commandSections.map((section) =>
    limiter(async () => {
      const markdownContent = await executePrompt(
        convertToMarkdownPrompt({ htmlContent: section.content })
      );
      return { name: section.title, markdownContent, command: true };
    })
  );

  const markdownFiles = await Promise.all(conversions);
  output.commands.push(...markdownFiles);
}

/**
 * Reduces a page to plain text.
 *
 * Removes the navigation footer, sticky content and "edit" links from the given
 * selection, wraps the text of every `code` element in backticks, then returns
 * the text of the last `.section` / `section` / `.part` element with runs of
 * blank lines collapsed into a single line break.
 *
 * Note: this mutates the DOM backing `$element`.
 *
 * @param $element selection to simplify (callers pass the `*` selection of a loaded page)
 * @returns the simplified text content
 */
function getSimpleText($element: Cheerio<AnyNode>) {
  $element.remove('.navfooter');
  $element.remove('#sticky_content');
  $element.remove('.edit_me');
  $element.find('code').each(function () {
    const $code = $(this);
    // Write the wrapped value back as literal text. Passing a string to
    // `replaceWith` makes cheerio parse it as HTML, so code such as `<b>` or
    // `&lt;` would be turned into markup / entities and lost from `.text()`.
    $code.text('`' + $code.text() + '`');
  });
  return $element
    .find('.section,section,.part')
    .last()
    .text()
    .replaceAll(/([\n]\s*){2,}/g, '\n');
}

/**
 * Splits a page into sections, one per `.position-relative` element found under
 * a `.section`.
 *
 * For each of those headers, the following siblings up to the next
 * `.position-relative` element form the section body. SVG defs, console copy
 * buttons, image blocks and tables are removed from the body (mutating the
 * underlying DOM) before it is serialized.
 *
 * @param cheerio selection to search the section headers in
 * @returns the sections in document order; `content` is the body HTML wrapped
 *          in a `<div>` and prefixed with an `<h1>` holding the title
 */
export function extractSections(cheerio: Cheerio<AnyNode>) {
  const sections: Array<{ title: string; content: string }> = [];
  const strippedSelectors = ['svg defs', '.console_code_copy', '.imageblock', 'table'];

  for (const header of cheerio.find('.section .position-relative').toArray()) {
    const $header = $(header);
    const $body = $header.nextUntil('.position-relative');

    // NOTE(review): this drops the first `edit` occurrence anywhere in the title,
    // presumably the text of the "edit" link rendered in the header - confirm.
    const title = $header.text().trim().replace('edit', '');

    for (const selector of strippedSelectors) {
      $body.find(selector).remove();
    }

    const htmlContent = $body
      .map((_position, node) => $(node).prop('outerHTML'))
      .toArray()
      .join('');

    // The `STATS ... BY` section is exposed under the shorter `STATS` name;
    // the embedded `<h1>` keeps the original title.
    const sectionTitle = title === 'STATS ... BY' ? 'STATS' : title;
    sections.push({
      title: sectionTitle,
      content: `<div><h1>${title}</h1> ${htmlContent}</div>`,
    });
  }

  return sections;
}

This file was deleted.

Loading

0 comments on commit 3226eb6

Please sign in to comment.