Merge branch '8.15' into update-bundled-packages-20240917131222
juliaElastic committed Sep 18, 2024
2 parents 2f90ef1 + 89e8c02 commit b9a9653
Showing 32 changed files with 1,188 additions and 40 deletions.
@@ -0,0 +1,25 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export const unstructuredLogState = {
lastExecutedChain: 'testchain',
packageName: 'testPackage',
dataStreamName: 'testDatastream',
grokPatterns: ['%{GREEDYDATA:message}'],
logSamples: ['dummy data'],
jsonSamples: ['{"message":"dummy data"}'],
finalized: false,
ecsVersion: 'testVersion',
errors: { test: 'testerror' },
additionalProcessors: [],
};

export const unstructuredLogResponse = {
grok_patterns: [
'####<%{MONTH} %{MONTHDAY}, %{YEAR} %{TIME} (?:AM|PM) %{WORD:timezone}> <%{WORD:log_level}> <%{WORD:component}> <%{DATA:hostname}> <%{DATA:server_name}> <%{DATA:thread_info}> <%{DATA:user}> <%{DATA:empty_field}> <%{DATA:empty_field2}> <%{NUMBER:timestamp}> <%{DATA:message_id}> <%{GREEDYDATA:message}>',
],
};
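The fixtures above exercise grok handling in tests. As a rough illustration of what a grok pattern such as `%{GREEDYDATA:message}` expands to, here is a minimal, hypothetical grok-to-RegExp sketch; the plugin itself delegates pattern execution to Elasticsearch's grok processor, which supports far more pattern names and escaping rules than shown here:

```typescript
// Minimal grok-to-JS-RegExp expansion for a few common pattern names.
// Hypothetical sketch only; not the plugin's createGrokProcessor.
const GROK_DEFINITIONS: Record<string, string> = {
  WORD: '\\w+',
  NUMBER: '\\d+(?:\\.\\d+)?',
  GREEDYDATA: '.*',
};

function grokToRegExp(pattern: string): RegExp {
  const source = pattern.replace(
    /%\{(\w+)(?::(\w+))?\}/g,
    (_m, name: string, field?: string) => {
      const body = GROK_DEFINITIONS[name] ?? '.*?';
      // Named fields become named capture groups; bare patterns match silently.
      return field ? `(?<${field}>${body})` : `(?:${body})`;
    }
  );
  return new RegExp(`^${source}$`);
}

const groups = grokToRegExp('%{GREEDYDATA:message}').exec('dummy data')?.groups;
// groups?.message === 'dummy data'
```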
@@ -33,7 +33,7 @@ export async function handleKVError({

return {
kvProcessor,
lastExecutedChain: 'kv_error',
lastExecutedChain: 'kvError',
};
}

@@ -43,6 +43,6 @@ describe('Testing kv header', () => {
expect(response.grokPattern).toStrictEqual(
'<%{NUMBER:priority}>%{NUMBER:version} %{GREEDYDATA:message}'
);
expect(response.lastExecutedChain).toBe('kv_header');
expect(response.lastExecutedChain).toBe('kvHeader');
});
});
@@ -26,6 +26,6 @@ export async function handleHeader({

return {
grokPattern: pattern.grok_pattern,
lastExecutedChain: 'kv_header',
lastExecutedChain: 'kvHeader',
};
}
23 changes: 15 additions & 8 deletions x-pack/plugins/integration_assistant/server/graphs/kv/prompts.ts
@@ -68,15 +68,19 @@ export const KV_HEADER_PROMPT = ChatPromptTemplate.fromMessages([
],
[
'human',
`Looking at the multiple syslog samples provided in the context, our goal is to identify which RFC they belog to. Then create a regex pattern that can separate the header and the structured body.
`Looking at the multiple syslog samples provided in the context, your task is to separate the "header" and the "message body" from this log. Our goal is to identify which RFC they belong to. Then create a regex pattern that can separate the header and the structured body.
You then have to create a grok pattern using the regex pattern.
You are given a log entry in a structured format.
Follow these steps to identify the header pattern:
1. Identify if the log samples fall under RFC5424 or RFC3164. If not, return 'Custom Format'.
2. The log samples contain the header and structured body. The header may contain any or all of priority, timestamp, loglevel, hostname, ipAddress, messageId, or other free-form, non-key-value information.
3. Make sure the regex and grok pattern matches all the header information. Only the structured message body should be under GREEDYDATA in grok pattern.
You ALWAYS follow these guidelines when writing your response:
<guidelines>
- If you cannot match all the logs to the same RFC, return 'Custom Format' for RFC and provide the regex and grok patterns accordingly.
- If the message part contains any unstructured data, make sure to add this in the regex pattern and grok pattern.
- Do not parse the message part in the regex. Only the header part should be in the regex and grok_pattern.
- Make sure to map the remaining message part to \'message\' in grok pattern.
- Make sure to map the remaining message body to \'message\' in grok pattern.
- Do not respond with anything except the processor as a JSON object enclosed with 3 backticks (\`), see example response above. Use strict JSON response format.
</guidelines>
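The header-splitting steps the prompt describes can be sketched in plain code. The regex below is a hypothetical RFC3164-style header matcher, assuming a fixed `<priority>timestamp hostname body` layout; it is illustrative only, not the model's expected output:

```typescript
// Hypothetical RFC3164-style split: parse only the header fields and keep
// everything after them in `message`, mirroring the GREEDYDATA guideline.
const HEADER_RE =
  /^<(?<priority>\d+)>(?<timestamp>\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}) (?<hostname>\S+) (?<message>.*)$/;

function splitHeader(line: string): Record<string, string> | null {
  const match = HEADER_RE.exec(line);
  return match?.groups ? { ...match.groups } : null;
}

const parts = splitHeader('<34>Oct 11 22:14:15 mymachine key1=value1 key2=value2');
// parts?.message holds only the structured body: 'key1=value1 key2=value2'
```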
@@ -110,12 +114,15 @@ export const KV_HEADER_ERROR_PROMPT = ChatPromptTemplate.fromMessages([
{errors}
</errors>
You ALWAYS follow these guidelines when writing your response:
Follow these steps to fix the errors in the header pattern:
1. Identify any mismatches, incorrect syntax, or logical errors in the pattern.
2. The log samples contain the header and structured body. The header may contain any or all of priority, timestamp, loglevel, hostname, ipAddress, messageId, or other free-form, non-key-value information.
3. The message body may start with a description, followed by structured key-value pairs.
4. Make sure the regex and grok pattern matches all the header information. Only the structured message body should be under GREEDYDATA in grok pattern.
You ALWAYS follow these guidelines when writing your response:
<guidelines>
- Identify any mismatches, incorrect syntax, or logical errors in the pattern.
- If the message part contains any unstructured data, make sure to add this in the grok pattern.
- Do not parse the message part in the regex. Only the header part should be in the regex and grok_pattern.
- Make sure to map the remaining message part to \'message\' in grok pattern.
- Make sure to map the remaining message body to \'message\' in grok pattern.
- Do not respond with anything except the processor as a JSON object enclosed with 3 backticks (\`), see example response above. Use strict JSON response format.
</guidelines>
@@ -41,7 +41,7 @@ export async function handleKVValidate({
)) as { pipelineResults: KVResult[]; errors: object[] };

if (errors.length > 0) {
return { errors, lastExecutedChain: 'kv_validate' };
return { errors, lastExecutedChain: 'kvValidate' };
}

// Converts the JSON object into a string and parses it as an array of JSON strings
@@ -56,7 +56,7 @@
jsonSamples,
additionalProcessors,
errors: [],
lastExecutedChain: 'kv_validate',
lastExecutedChain: 'kvValidate',
};
}
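The comment above describes converting pipeline result objects into an array of JSON strings, matching the `jsonSamples: string[]` shape in the state fixtures. A minimal sketch of that conversion (hypothetical helper name; the plugin's actual implementation may differ):

```typescript
// Serialize each simulated pipeline result into a standalone JSON string,
// so downstream nodes receive samples in the `jsonSamples: string[]` shape.
function toJsonSamples(pipelineResults: object[]): string[] {
  return pipelineResults.map((result) => JSON.stringify(result));
}

const jsonSamples = toJsonSamples([{ message: 'dummy data' }]);
// jsonSamples[0] === '{"message":"dummy data"}'
```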

@@ -65,7 +65,7 @@ export async function handleHeaderValidate({
client,
}: HandleKVNodeParams): Promise<Partial<KVState>> {
const grokPattern = state.grokPattern;
const grokProcessor = createGrokProcessor(grokPattern);
const grokProcessor = createGrokProcessor([grokPattern]);
const pipeline = { processors: grokProcessor, on_failure: [onFailure] };

const { pipelineResults, errors } = (await testPipeline(state.logSamples, pipeline, client)) as {
@@ -14,6 +14,7 @@ import { ESProcessorItem, SamplesFormat } from '../../../common';
import { getKVGraph } from '../kv/graph';
import { LogDetectionGraphParams, LogDetectionBaseNodeParams } from './types';
import { LogFormat } from '../../constants';
import { getUnstructuredGraph } from '../unstructured/graph';

const graphState: StateGraphArgs<LogFormatDetectionState>['channels'] = {
lastExecutedChain: {
@@ -90,9 +91,9 @@ function logFormatRouter({ state }: LogDetectionBaseNodeParams): string {
if (state.samplesFormat.name === LogFormat.STRUCTURED) {
return 'structured';
}
// if (state.samplesFormat === LogFormat.UNSTRUCTURED) {
// return 'unstructured';
// }
if (state.samplesFormat.name === LogFormat.UNSTRUCTURED) {
return 'unstructured';
}
// if (state.samplesFormat === LogFormat.CSV) {
// return 'csv';
// }
@@ -109,18 +110,19 @@ export async function getLogFormatDetectionGraph({ model, client }: LogDetection
handleLogFormatDetection({ state, model })
)
.addNode('handleKVGraph', await getKVGraph({ model, client }))
// .addNode('handleUnstructuredGraph', (state: LogFormatDetectionState) => getCompiledUnstructuredGraph({state, model}))
.addNode('handleUnstructuredGraph', await getUnstructuredGraph({ model, client }))
// .addNode('handleCsvGraph', (state: LogFormatDetectionState) => getCompiledCsvGraph({state, model}))
.addEdge(START, 'modelInput')
.addEdge('modelInput', 'handleLogFormatDetection')
.addEdge('handleKVGraph', 'modelOutput')
.addEdge('handleUnstructuredGraph', 'modelOutput')
.addEdge('modelOutput', END)
.addConditionalEdges(
'handleLogFormatDetection',
(state: LogFormatDetectionState) => logFormatRouter({ state }),
{
structured: 'handleKVGraph',
// unstructured: 'handleUnstructuredGraph',
unstructured: 'handleUnstructuredGraph',
// csv: 'handleCsvGraph',
unsupported: 'modelOutput',
}
@@ -20,15 +20,19 @@ Here is some context for you to reference for your task, read it carefully as yo
[
'human',
`Looking at the log samples, our goal is to identify the syslog type based on the guidelines below.
Follow these steps to identify the log format type:
1. Go through each log sample and identify the log format type.
2. If the samples have any or all of priority, timestamp, loglevel, hostname, ipAddress, messageId in the beginning information then set "header: true".
3. If the samples have a syslog header then set "header: true" , else set "header: false". If you are unable to determine the syslog header presence then set "header: false".
4. If the log samples have structured message body with key-value pairs then classify it as "log_type: structured". Look for a flat list of key-value pairs, often separated by spaces, commas, or other delimiters.
5. Consider variations in formatting, such as quotes around values ("key=value", key="value"), special characters in keys or values, or escape sequences.
6. If the log samples have unstructured body like a free-form text then classify it as "log_type: unstructured".
7. If the log samples follow a csv format then classify it as "log_type: csv".
8. If the samples are identified as "csv" and there is a csv header then set "header: true" , else set "header: false".
9. If you do not find the log format in any of the above categories then classify it as "log_type: unsupported".
You ALWAYS follow these guidelines when writing your response:
<guidelines>
- Go through each log sample and identify the log format type.
- If the samples have a timestamp , loglevel in the beginning information then set "header: true".
- If the samples have a syslog header then set "header: true" , else set "header: false". If you are unable to determine the syslog header presence then set "header: false".
- If the syslog samples have structured body then classify it as "log_type: structured".
- If the syslog samples have unstructured body then classify it as "log_type: unstructured".
- If the syslog samples follow a csv format then classify it as "log_type: csv".
- If the samples are identified as "csv" and there is a csv header then set "header: true" , else set "header: false".
- If you do not find the log format in any of the above categories then classify it as "log_type: unsupported".
- Do not respond with anything except the updated current mapping JSON object enclosed with 3 backticks (\`). See example response below.
</guidelines>
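As a rough illustration, the classification steps above could be approximated by a plain heuristic. This is a deliberate simplification of what the prompt asks the LLM to do; real samples are messier and need the model's judgment:

```typescript
type LogType = 'structured' | 'unstructured' | 'csv' | 'unsupported';

// Crude heuristic mirroring the prompt's steps: two or more key=value pairs
// => structured; several comma-separated fields => csv; any other non-empty
// text => unstructured; empty input => unsupported.
function classifyLogSample(sample: string): LogType {
  const kvPairs = sample.match(/[\w.]+=(?:"[^"]*"|\S+)/g);
  if (kvPairs && kvPairs.length >= 2) return 'structured';
  if (sample.split(',').length >= 3) return 'csv';
  if (sample.trim().length > 0) return 'unstructured';
  return 'unsupported';
}
```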
@@ -0,0 +1,27 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export const GROK_EXAMPLE_ANSWER = {
rfc: 'RFC2454',
regex:
'/(?:(d{4}[-]d{2}[-]d{2}[T]d{2}[:]d{2}[:]d{2}(?:.d{1,6})?(?:[+-]d{2}[:]d{2}|Z)?)|-)s(?:([w][wd.@-]*)|-)s(.*)$/',
grok_patterns: ['%{WORD:key1}:%{WORD:value1};%{WORD:key2}:%{WORD:value2}:%{GREEDYDATA:message}'],
};

export const GROK_ERROR_EXAMPLE_ANSWER = {
grok_patterns: [
'%{TIMESTAMP:timestamp}:%{WORD:value1};%{WORD:key2}:%{WORD:value2}:%{GREEDYDATA:message}',
],
};

export const onFailure = {
append: {
field: 'error.message',
value:
'{% raw %}Processor {{{_ingest.on_failure_processor_type}}} with tag {{{_ingest.on_failure_processor_tag}}} in pipeline {{{_ingest.on_failure_pipeline}}} failed with message: {{{_ingest.on_failure_message}}}{% endraw %}',
},
};
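The `onFailure` processor above is attached to generated pipelines before simulation (see `{ processors: grokProcessor, on_failure: [onFailure] }` in the validation handlers). A minimal sketch of the request-body shape it ends up in; the field names follow the Elasticsearch ingest simulate API, and the grok processor shown is illustrative:

```typescript
// Illustrative simulate-request body: generated grok processors plus a
// shared on_failure handler, run over raw log samples wrapped as docs.
const onFailure = {
  append: { field: 'error.message', value: '{{{_ingest.on_failure_message}}}' },
};

const grokProcessor = [
  { grok: { field: 'message', patterns: ['%{GREEDYDATA:message}'] } },
];

const simulateBody = {
  pipeline: { processors: grokProcessor, on_failure: [onFailure] },
  docs: ['dummy data'].map((sample) => ({ _source: { message: sample } })),
};
```

Something like `client.asCurrentUser.ingest.simulate(simulateBody)` would then run it, which is the call shape the mocked `IScopedClusterClient` in the tests stubs out.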
@@ -0,0 +1,32 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { JsonOutputParser } from '@langchain/core/output_parsers';
import type { UnstructuredLogState } from '../../types';
import type { HandleUnstructuredNodeParams } from './types';
import { GROK_ERROR_PROMPT } from './prompts';
import { GROK_ERROR_EXAMPLE_ANSWER } from './constants';

export async function handleUnstructuredError({
state,
model,
}: HandleUnstructuredNodeParams): Promise<Partial<UnstructuredLogState>> {
const outputParser = new JsonOutputParser();
const grokErrorGraph = GROK_ERROR_PROMPT.pipe(model).pipe(outputParser);
const currentPatterns = state.grokPatterns;

const pattern = await grokErrorGraph.invoke({
current_pattern: JSON.stringify(currentPatterns, null, 2),
errors: JSON.stringify(state.errors, null, 2),
ex_answer: JSON.stringify(GROK_ERROR_EXAMPLE_ANSWER, null, 2),
});

return {
grokPatterns: pattern.grok_patterns,
lastExecutedChain: 'unstructuredError',
};
}
@@ -0,0 +1,40 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { FakeLLM } from '@langchain/core/utils/testing';
import { handleUnstructuredError } from './error';
import type { UnstructuredLogState } from '../../types';
import {
unstructuredLogState,
unstructuredLogResponse,
} from '../../../__jest__/fixtures/unstructured';
import {
ActionsClientChatOpenAI,
ActionsClientSimpleChatModel,
} from '@kbn/langchain/server/language_models';
import { IScopedClusterClient } from '@kbn/core-elasticsearch-server';

const model = new FakeLLM({
response: JSON.stringify(unstructuredLogResponse, null, 2),
}) as unknown as ActionsClientChatOpenAI | ActionsClientSimpleChatModel;

const state: UnstructuredLogState = unstructuredLogState;

describe('Testing unstructured error handling node', () => {
const client = {
asCurrentUser: {
ingest: {
simulate: jest.fn(),
},
},
} as unknown as IScopedClusterClient;
it('handleUnstructuredError()', async () => {
const response = await handleUnstructuredError({ state, model, client });
expect(response.grokPatterns).toStrictEqual(unstructuredLogResponse.grok_patterns);
expect(response.lastExecutedChain).toBe('unstructuredError');
});
});
@@ -0,0 +1,39 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import {
ActionsClientChatOpenAI,
ActionsClientSimpleChatModel,
} from '@kbn/langchain/server/language_models';
import { FakeLLM } from '@langchain/core/utils/testing';
import { getUnstructuredGraph } from './graph';
import { IScopedClusterClient } from '@kbn/core-elasticsearch-server';

const model = new FakeLLM({
response: '{"log_type": "structured"}',
}) as unknown as ActionsClientChatOpenAI | ActionsClientSimpleChatModel;

describe('UnstructuredGraph', () => {
const client = {
asCurrentUser: {
ingest: {
simulate: jest.fn(),
},
},
} as unknown as IScopedClusterClient;
describe('Compiling and Running', () => {
it('Ensures that the graph compiles', async () => {
// When getUnstructuredGraph runs, langgraph compiles the graph; it will error if the graph has any issues,
// for example a node with no next step, or an infinite loop between nodes.
try {
await getUnstructuredGraph({ model, client });
} catch (error) {
fail(`getUnstructuredGraph threw an error: ${error}`);
}
});
});
});