Skip to content

Commit

Permalink
[8.x] [inference] NL-to-ESQL: improve doc generation (#192378) (#192802)
Browse files Browse the repository at this point in the history
# Backport

This will backport the following commits from `main` to `8.x`:
- [[inference] NL-to-ESQL: improve doc generation
(#192378)](#192378)

<!--- Backport version: 9.4.3 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Pierre
Gayvallet","email":"pierre.gayvallet@elastic.co"},"sourceCommit":{"committedDate":"2024-09-13T07:29:29Z","message":"[inference]
NL-to-ESQL: improve doc generation (#192378)\n\n##
Summary\r\n\r\nFollow-up of
https://github.com/elastic/kibana/pull/190433\r\n\r\nFix
[#192762](https://github.com/elastic/kibana/issues/192762)\r\n\r\n-
Cleanup and refactor the documentation generation script\r\n- Make some
tweak to the documentation to improve efficiency and make a\r\nbetter
user of tokens\r\n- Perform human review of the generated content to
make sure everything\r\nis
accurate\r\n\r\n---------\r\n\r\nCo-authored-by: kibanamachine
<42973632+kibanamachine@users.noreply.github.com>","sha":"3226eb691af82882cdc89edd9ddff9abbcac1e5c","branchLabelMapping":{"^v9.0.0$":"main","^v8.16.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:skip","v9.0.0","backport:prev-minor","v8.16.0","Team:AI
Infra"],"title":"[inference] NL-to-ESQL: improve doc
generation","number":192378,"url":"https://github.com/elastic/kibana/pull/192378","mergeCommit":{"message":"[inference]
NL-to-ESQL: improve doc generation (#192378)\n\n##
Summary\r\n\r\nFollow-up of
https://github.com/elastic/kibana/pull/190433\r\n\r\nFix
[#192762](https://github.com/elastic/kibana/issues/192762)\r\n\r\n-
Cleanup and refactor the documentation generation script\r\n- Make some
tweak to the documentation to improve efficiency and make a\r\nbetter
user of tokens\r\n- Perform human review of the generated content to
make sure everything\r\nis
accurate\r\n\r\n---------\r\n\r\nCo-authored-by: kibanamachine
<42973632+kibanamachine@users.noreply.github.com>","sha":"3226eb691af82882cdc89edd9ddff9abbcac1e5c"}},"sourceBranch":"main","suggestedTargetBranches":["8.x"],"targetPullRequestStates":[{"branch":"main","label":"v9.0.0","branchLabelMappingKey":"^v9.0.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/192378","number":192378,"mergeCommit":{"message":"[inference]
NL-to-ESQL: improve doc generation (#192378)\n\n##
Summary\r\n\r\nFollow-up of
https://github.com/elastic/kibana/pull/190433\r\n\r\nFix
[#192762](https://github.com/elastic/kibana/issues/192762)\r\n\r\n-
Cleanup and refactor the documentation generation script\r\n- Make some
tweak to the documentation to improve efficiency and make a\r\nbetter
user of tokens\r\n- Perform human review of the generated content to
make sure everything\r\nis
accurate\r\n\r\n---------\r\n\r\nCo-authored-by: kibanamachine
<42973632+kibanamachine@users.noreply.github.com>","sha":"3226eb691af82882cdc89edd9ddff9abbcac1e5c"}},{"branch":"8.x","label":"v8.16.0","branchLabelMappingKey":"^v8.16.0$","isSourceBranch":false,"state":"NOT_CREATED"}]}]
BACKPORT-->

Co-authored-by: Pierre Gayvallet <pierre.gayvallet@elastic.co>
  • Loading branch information
kibanamachine and pgayvallet authored Sep 13, 2024
1 parent 7c265ec commit 8936983
Show file tree
Hide file tree
Showing 133 changed files with 3,560 additions and 1,493 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ const buildTestDefinitions = (): Section[] => {
{
title: 'Generates a query to show employees filtered by name and grouped by hire_date',
question: `From the employees index, I want to see how many employees with a "B" in their first name
where hired each month over the past 2 years.
were hired each month over the past 2 years.
Assume the following fields:
- hire_date
- first_name
Expand All @@ -208,10 +208,10 @@ const buildTestDefinitions = (): Section[] => {
(which can be read the same backward and forward), and then return their last name and first name
- last_name
- first_name`,
expected: `FROM employees
| EVAL reversed_last_name = REVERSE(last_name)
| WHERE TO_LOWER(last_name) == TO_LOWER(reversed_last_name)
| KEEP last_name, first_name`,
criteria: [
`The assistant should not provide an ES|QL query, and explicitly mention that there is no
way to check for palindromes using ES|QL.`,
],
},
{
title: 'Generates a query to show the top 10 domains by doc count',
Expand Down
288 changes: 288 additions & 0 deletions x-pack/plugins/inference/scripts/load_esql_docs/extract_doc_entries.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import Fs from 'fs/promises';
import Path from 'path';
import fastGlob from 'fast-glob';
import $, { load, Cheerio, AnyNode } from 'cheerio';
import { partition } from 'lodash';
import { ToolingLog } from '@kbn/tooling-log';
import pLimit from 'p-limit';
import { ScriptInferenceClient } from '../util/kibana_client';
import { convertToMarkdownPrompt } from './prompts/convert_to_markdown';
import { bindOutput, PromptCaller } from './utils/output_executor';

/**
 * The pages that will be extracted but only used as context
 * for the LLM for the enhancement tasks of the documentation entries.
 */
const contextArticles = [
  'esql.html',
  'esql-syntax.html',
  'esql-kibana.html',
  'esql-query-api.html',
  'esql-limitations.html',
  'esql-cross-clusters.html',
  'esql-examples.html',
  'esql-metadata-fields.html',
  'esql-multi-index.html',
];

/**
 * A documentation page extracted as plain text, used only as LLM context
 * (not turned into an individual doc entry).
 */
interface ExtractedPage {
  // basename of the source html file, e.g. 'esql-syntax.html'
  sourceFile: string;
  // short name derived from the file name, e.g. 'syntax'
  name: string;
  // plain-text content of the page
  content: string;
}

/**
 * A single extracted ES|QL command or function/operator,
 * converted to markdown via the LLM.
 */
export interface ExtractedCommandOrFunc {
  // command or function name, e.g. 'STATS' or 'AVG'
  name: string;
  // LLM-generated markdown version of the html documentation section
  markdownContent: string;
  // true when the entry is a command, false when it is a function/operator
  command: boolean;
}

/**
 * Aggregated result of the extraction pass over the built html docs.
 */
export interface ExtractionOutput {
  commands: ExtractedCommandOrFunc[];
  functions: ExtractedCommandOrFunc[];
  // full pages kept as plain-text context for later enhancement tasks
  pages: ExtractedPage[];
  // basenames of html files that matched the glob but were not processed
  skippedFile: string[];
}

/**
 * Walks the built ES|QL html documentation under `builtDocsDir` and extracts
 * commands, functions/operators and context pages into an {@link ExtractionOutput}.
 *
 * @param builtDocsDir - root directory of the built docs checkout
 * @param log - tooling logger forwarded to the per-file processors
 * @param inferenceClient - inference client used for the markdown-conversion prompts
 * @throws when the glob matches no html file
 */
export async function extractDocEntries({
  builtDocsDir,
  log,
  inferenceClient,
}: {
  builtDocsDir: string;
  log: ToolingLog;
  inferenceClient: ScriptInferenceClient;
}): Promise<ExtractionOutput> {
  const htmlFiles = await fastGlob(
    `${builtDocsDir}/html/en/elasticsearch/reference/master/esql*.html`
  );
  if (htmlFiles.length === 0) {
    throw new Error('No files found');
  }

  const results: ExtractionOutput = {
    commands: [],
    functions: [],
    pages: [],
    skippedFile: [],
  };

  // bind the inference client once; every prompt call reuses the same connector
  const promptCaller = bindOutput({
    output: inferenceClient.output,
    connectorId: inferenceClient.getConnectorId(),
  });

  // cap concurrent LLM calls across all files
  const concurrencyLimiter = pLimit(10);

  await Promise.all(
    htmlFiles.map((file) =>
      processFile({
        file,
        log,
        executePrompt: promptCaller,
        output: results,
        limiter: concurrencyLimiter,
      })
    )
  );

  return results;
}

/**
 * Routes a single html file to the appropriate extraction routine:
 * commands page, functions/operators page, plain-text context page,
 * or records it as skipped.
 */
async function processFile({
  file: fileFullPath,
  output,
  executePrompt,
  log,
  limiter,
}: {
  file: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  const fileName = Path.basename(fileFullPath);
  const fileContent = (await Fs.readFile(fileFullPath)).toString('utf-8');

  switch (fileName) {
    case 'esql-commands.html':
      // source commands
      await processCommands({ fileContent, log, output, limiter, executePrompt });
      break;
    case 'esql-functions-operators.html':
      // source functions / operators
      await processFunctionsAndOperators({ fileContent, log, output, limiter, executePrompt });
      break;
    default:
      if (contextArticles.includes(fileName)) {
        // keep the whole page as plain-text LLM context
        const $element = load(fileContent)('*');
        output.pages.push({
          sourceFile: fileName,
          // 'esql.html' becomes 'overview'; otherwise strip the 'esql-' prefix and '.html' suffix
          name: fileName === 'esql.html' ? 'overview' : fileName.substring(5, fileName.length - 5),
          content: getSimpleText($element),
        });
      } else {
        output.skippedFile.push(fileName);
      }
  }
}

/**
 * Extracts the functions and operators from 'esql-functions-operators.html'.
 *
 * Operator sections are kept as a single plain-text 'operators' context page;
 * function sections (titles that are ALL_CAPS identifiers) are individually
 * converted to markdown via the LLM (throttled by `limiter`) and appended
 * to `output.functions`.
 */
async function processFunctionsAndOperators({
  fileContent,
  output,
  executePrompt,
  log,
  limiter,
}: {
  fileContent: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  // fileContent is already a string — no toString() needed
  const $element = load(fileContent)('*');

  const sections = extractSections($element);

  // section-title prefixes identifying operator (rather than function) sections
  const searches = [
    'Binary operators',
    'Equality',
    'Inequality',
    'Less than',
    'Less than or equal to',
    'Greater than',
    'Greater than or equal to',
    'Add +',
    'Subtract -',
    'Multiply *',
    'Divide /',
    'Modulus %',
    'Unary operators',
    'Logical operators',
    'IS NULL and IS NOT NULL',
    'Cast (::)',
  ];

  // exact-title operator matches
  const matches = ['IN', 'LIKE', 'RLIKE'];

  // hoisted out of the partition predicate: lowercase the prefixes once
  // instead of once per (section, search) pair
  const lowerSearches = searches.map((search) => search.toLowerCase());

  const [operatorSections, allOtherSections] = partition(sections, (section) => {
    const lowerTitle = section.title.toLowerCase();
    return (
      matches.includes(section.title) ||
      lowerSearches.some((search) => lowerTitle.startsWith(search))
    );
  });

  // function sections are those whose title is an ALL_CAPS identifier (e.g. 'AVG')
  const functionSections = allOtherSections.filter(({ title }) => !!title.match(/^[A-Z_]+$/));

  const markdownFiles = await Promise.all(
    functionSections.map(async (section) => {
      return limiter(async () => {
        return {
          name: section.title,
          markdownContent: await executePrompt(
            convertToMarkdownPrompt({ htmlContent: section.content })
          ),
          command: false,
        };
      });
    })
  );

  output.functions.push(...markdownFiles);

  // operators are not converted individually; they are kept as one context page
  output.pages.push({
    sourceFile: 'esql-functions-operators.html',
    name: 'operators',
    content: operatorSections.map(({ title, content }) => `${title}\n${content}`).join('\n'),
  });
}

/**
 * Extracts the source commands from 'esql-commands.html'.
 *
 * Each section whose title is an ALL_CAPS identifier (e.g. 'STATS') is
 * converted to markdown via the LLM (throttled by `limiter`) and appended
 * to `output.commands`.
 */
async function processCommands({
  fileContent,
  output,
  executePrompt,
  log,
  limiter,
}: {
  fileContent: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  // fileContent is already a string — no toString() needed
  const $element = load(fileContent)('*');

  // command sections are those whose title is an ALL_CAPS identifier
  const sections = extractSections($element).filter(({ title }) => !!title.match(/^[A-Z_]+$/));

  const markdownFiles = await Promise.all(
    sections.map(async (section) => {
      return limiter(async () => {
        return {
          name: section.title,
          markdownContent: await executePrompt(
            convertToMarkdownPrompt({ htmlContent: section.content })
          ),
          command: true,
        };
      });
    })
  );

  output.commands.push(...markdownFiles);
}

/**
 * Converts a loaded documentation page to plain text: strips navigation
 * chrome, rewrites <code> elements as backtick-quoted text, and collapses
 * runs of blank lines.
 */
function getSimpleText($element: Cheerio<AnyNode>) {
  // drop page chrome that carries no documentation content
  $element.remove('.navfooter');
  $element.remove('#sticky_content');
  $element.remove('.edit_me');

  // render inline code as markdown-style backticks
  $element.find('code').each((_, node) => {
    const $code = $(node);
    $code.replaceWith('`' + $code.text() + '`');
  });

  const text = $element.find('.section,section,.part').last().text();
  // collapse consecutive blank/whitespace-only lines into a single newline
  return text.replaceAll(/([\n]\s*){2,}/g, '\n');
}

/**
 * Splits a loaded documentation page into titled sections.
 *
 * Each '.position-relative' header inside '.section' starts a section; the
 * section's content is every sibling node up to the next header, cleaned of
 * copy buttons, images, svg defs and tables, and re-wrapped in a div with
 * the title as an <h1>.
 */
export function extractSections(cheerio: Cheerio<AnyNode>) {
  const sections: Array<{
    title: string;
    content: string;
  }> = [];

  cheerio.find('.section .position-relative').each((_, header) => {
    const $header = $(header);
    // NOTE(review): removes the first 'edit' occurrence anywhere in the title —
    // presumably the docs' trailing "edit" link text; verify against the html
    const sectionTitle = $header.text().trim().replace('edit', '');

    const bodyNodes = $header.nextUntil('.position-relative');

    // strip non-textual noise before serializing
    bodyNodes.find('svg defs').remove();
    bodyNodes.find('.console_code_copy').remove();
    bodyNodes.find('.imageblock').remove();
    bodyNodes.find('table').remove();

    const serialized = bodyNodes
      .map((i, node) => $(node).prop('outerHTML'))
      .toArray()
      .join('');

    sections.push({
      // normalize the variadic STATS title to its command name
      title: sectionTitle === 'STATS ... BY' ? 'STATS' : sectionTitle,
      content: `<div><h1>${sectionTitle}</h1> ${serialized}</div>`,
    });
  });

  return sections;
}

This file was deleted.

Loading

0 comments on commit 8936983

Please sign in to comment.