camunda · sbuettner · Oct 17, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/...ors/idp-extraction/element-templates/hybrid/idp-extraction-outbound-connector-hybrid.json b/...ors/idp-extraction/element-templates/hybrid/idp-extraction-outbound-connector-hybrid.json
diff --git a/connectors/idp-extraction/pom.xml b/connectors/idp-extraction/pom.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>io.camunda.connector</groupId>
+ <artifactId>connectors-parent</artifactId>
+ <version>8.7.0-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>connector-idp-extraction</artifactId>
+ <name>connector-idp-extraction</name>
+ <description>Camunda IDP extraction outbound Connector</description>
+ <packaging>jar</packaging>
+
+ <properties>
+ <maven.compiler.source>21</maven.compiler.source> <!-- Java 21 is needed: the sources call List.getFirst() and close an ExecutorService via try-with-resources. -->
+ <maven.compiler.target>21</maven.compiler.target>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId> <!-- Used by the APACHE_PDFBOX extraction engine (Loader, PDDocument, PDFTextStripper). -->
+ <version>3.0.3</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>io.camunda.connector</groupId>
+ <artifactId>element-template-generator-maven-plugin</artifactId>
+ <version>${project.version}</version> <!-- The generator plugin version follows this module's own version. -->
+ <configuration>
+ <connectors>
+ <connector>
+ <connectorClass>io.camunda.connector.idp.extraction.ExtractionConnectorFunction</connectorClass>
+ <files>
+ <file>
+ <templateId>io.camunda.connectors.idp.extraction.v1</templateId> <!-- NOTE(review): differs from the id in @ElementTemplate on the connector class (io.camunda.connector.IdpExtractionOutBoundTemplate.v1); confirm which id is intended. -->
+ <templateFileName>idp-extraction-outbound-connector.json</templateFileName>
+ </file>
+ </files>
+ <generateHybridTemplates>true</generateHybridTemplates> <!-- Presumably also emits the hybrid variant of the element template; confirm against the plugin documentation. -->
+ </connector>
+ </connectors>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
@@ -0,0 +1,116 @@
+/*
+ * Copyright Camunda Services GmbH and/or licensed to Camunda Services GmbH
+ * under one or more contributor license agreements. Licensed under a proprietary license.
+ * See the License.txt file for more information. You may not use this file
+ * except in compliance with the proprietary license.
+ */
+package io.camunda.connector.idp.extraction;
+
+import io.camunda.connector.api.annotation.OutboundConnector;
+import io.camunda.connector.api.error.ConnectorException;
+import io.camunda.connector.api.outbound.OutboundConnectorContext;
+import io.camunda.connector.api.outbound.OutboundConnectorFunction;
+import io.camunda.connector.generator.java.annotation.ElementTemplate;
+import io.camunda.connector.idp.extraction.caller.BedrockCaller;
+import io.camunda.connector.idp.extraction.caller.PollingTextractCaller;
+import io.camunda.connector.idp.extraction.model.ExtractionRequest;
+import io.camunda.connector.idp.extraction.model.ExtractionResult;
+import io.camunda.connector.idp.extraction.supplier.BedrockRuntimeClientSupplier;
+import io.camunda.connector.idp.extraction.supplier.S3ClientSupplier;
+import io.camunda.connector.idp.extraction.supplier.TextractClientSupplier;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URI;
+import java.net.URL;
+
+/**
+ * Outbound Connector that extracts the text of a document and asks an AWS Bedrock model to pull
+ * the requested variables out of that text.
+ *
+ * <p>The text is obtained either through AWS Textract or locally through Apache PDFBox, depending
+ * on the extraction engine selected in the request input.
+ */
+@OutboundConnector(
+    name = "IDP extraction outbound Connector",
+    inputVariables = {"authentication", "configuration", "input"},
+    type = "io.camunda:idp-extraction-connector-template:1")
+@ElementTemplate(
+    id = "io.camunda.connector.IdpExtractionOutBoundTemplate.v1",
+    name = "IDP extraction outbound Connector",
+    version = 1,
+    description = "Execute IDP extraction requests",
+    icon = "icon.svg",
+    documentationRef = "https://docs.camunda.io/docs/guides/",
+    propertyGroups = {
+      @ElementTemplate.PropertyGroup(id = "authentication", label = "Authentication"),
+      @ElementTemplate.PropertyGroup(id = "configuration", label = "Configuration"),
+      @ElementTemplate.PropertyGroup(id = "input", label = "Input message data")
+    },
+    inputDataClass = ExtractionRequest.class)
+public class ExtractionConnectorFunction implements OutboundConnectorFunction {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(ExtractionConnectorFunction.class);
+
+  private final TextractClientSupplier textractClientSupplier;
+
+  private final S3ClientSupplier s3ClientSupplier;
+
+  private final BedrockRuntimeClientSupplier bedrockRuntimeClientSupplier;
+
+  private final PollingTextractCaller pollingTextractCaller;
+
+  private final BedrockCaller bedrockCaller;
+
+  /** Creates the connector with default Textract and Bedrock callers. */
+  public ExtractionConnectorFunction() {
+    this(new PollingTextractCaller(), new BedrockCaller());
+  }
+
+  /**
+   * Creates the connector with the given callers; the AWS client suppliers are always the default
+   * implementations.
+   *
+   * @param pollingTextractCaller caller used for the {@code AWS_TEXTRACT} engine
+   * @param bedrockCaller caller used to query the Bedrock model
+   */
+  public ExtractionConnectorFunction(
+      PollingTextractCaller pollingTextractCaller, BedrockCaller bedrockCaller) {
+    this.textractClientSupplier = new TextractClientSupplier();
+    this.s3ClientSupplier = new S3ClientSupplier();
+    this.bedrockRuntimeClientSupplier = new BedrockRuntimeClientSupplier();
+    this.pollingTextractCaller = pollingTextractCaller;
+    this.bedrockCaller = bedrockCaller;
+  }
+
+  /**
+   * Binds the request, extracts the document text with the selected engine and forwards it to
+   * Bedrock.
+   *
+   * @param context connector context the {@link ExtractionRequest} is bound from
+   * @return an {@link ExtractionResult} wrapping the text returned by the Bedrock model
+   * @throws ConnectorException wrapping any failure raised during extraction or the model call
+   */
+  @Override
+  public Object execute(OutboundConnectorContext context) {
+    final var extractionRequest = context.bindVariables(ExtractionRequest.class);
+
+    try {
+      String extractedText =
+          switch (extractionRequest.getInput().extractionEngineType()) {
+            case AWS_TEXTRACT -> extractTextUsingAwsTextract(extractionRequest);
+            case APACHE_PDFBOX -> extractTextUsingApachePdf(extractionRequest);
+          };
+
+      String bedrockResponse =
+          bedrockCaller.call(
+              extractionRequest,
+              extractedText,
+              bedrockRuntimeClientSupplier.getBedrockRuntimeClient(extractionRequest));
+
+      return new ExtractionResult(bedrockResponse);
+    } catch (Exception e) {
+      if (e instanceof InterruptedException) {
+        // The exception is converted below, so restore the flag for whoever owns this thread.
+        Thread.currentThread().interrupt();
+      }
+      // Passing the exception as the last argument makes SLF4J log the stack trace as well.
+      LOGGER.error("Document extraction failed: {}", e.getMessage(), e);
+      throw new ConnectorException(e);
+    }
+  }
+
+  /** Delegates text extraction to AWS Textract using clients built from the request. */
+  private String extractTextUsingAwsTextract(ExtractionRequest extractionRequest) throws Exception {
+    return pollingTextractCaller.call(
+        extractionRequest.getInput().documentUrl(),
+        extractionRequest.getInput().s3BucketName(),
+        textractClientSupplier.getTextractClient(extractionRequest),
+        s3ClientSupplier.getAsyncS3Client(extractionRequest));
+  }
+
+  /**
+   * Downloads the PDF behind the request's document URL and extracts its text with PDFBox.
+   *
+   * <p>Both the download stream and the parsed document are closed before returning, including
+   * when reading or parsing fails.
+   */
+  private String extractTextUsingApachePdf(ExtractionRequest extractionRequest) throws Exception {
+    String documentUrl = extractionRequest.getInput().documentUrl();
+    // NOTE(review): the URL is taken from the request and fetched as-is; confirm whether the
+    // allowed schemes/hosts need to be restricted for the environments this runs in.
+    URL url = URI.create(documentUrl).toURL();
+
+    final byte[] pdfBytes;
+    try (var inputStream = url.openStream()) {
+      pdfBytes = IOUtils.toByteArray(inputStream);
+    }
+
+    try (PDDocument document = Loader.loadPDF(pdfBytes)) {
+      return new PDFTextStripper().getText(document);
+    }
+  }
+}
@@ -0,0 +1,85 @@
+/*
+ * Copyright Camunda Services GmbH and/or licensed to Camunda Services GmbH
+ * under one or more contributor license agreements. Licensed under a proprietary license.
+ * See the License.txt file for more information. You may not use this file
+ * except in compliance with the proprietary license.
+ */
+package io.camunda.connector.idp.extraction.caller;
+
+import io.camunda.connector.idp.extraction.model.ConverseData;
+import io.camunda.connector.idp.extraction.model.ExtractionRequest;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import software.amazon.awssdk.services.bedrockruntime.BedrockRuntimeClient;
+import software.amazon.awssdk.services.bedrockruntime.model.ContentBlock;
+import software.amazon.awssdk.services.bedrockruntime.model.ConversationRole;
+import software.amazon.awssdk.services.bedrockruntime.model.ConverseResponse;
+import software.amazon.awssdk.services.bedrockruntime.model.Message;
+
+import java.util.stream.Collectors;
+
+/** Builds the extraction prompt and sends it to an AWS Bedrock model through the Converse API. */
+public class BedrockCaller {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(BedrockCaller.class);
+
+  /**
+   * Prompt template with two {@code %s} slots, in order: the extracted document text and the
+   * rendered taxonomy items.
+   *
+   * <p>Both slots are filled in a single formatting pass, so placeholder-like sequences inside the
+   * document text are never expanded. A literal percent sign added to this template must be
+   * written as {@code %%}.
+   */
+  private static final String SYSTEM_PROMPT_TEMPLATE =
+      """
+      You will receive extracted text from a PDF document. This text will be between the <DOCUMENT_TEXT> tags.
+      Your task is to extract certain variables from the text. The description how to extract the variables is
+      between the <EXTRACTION> tags. Every variable is represented by a <VAR> tag. Every variable has a name,
+      which is represented by the <NAME> tag, as well as instructions which data to extract, which is represented
+      by the <PROMPT> tag.
+
+      Respond in JSON format, without any preamble. Example response:
+      {
+        "name": "John Smith",
+        "age": 32
+      }
+
+      Here is the document text as well as your instructions on which variables to extract:
+      <DOCUMENT_TEXT>%s</DOCUMENT_TEXT>
+      <EXTRACTION>%s</EXTRACTION>
+      """;
+
+  /** Template for one taxonomy item; slots are the variable name and its extraction prompt. */
+  private static final String SYSTEM_PROMPT_VARIABLE_TEMPLATE =
+      """
+      <VAR>
+        <NAME>%s</NAME>
+        <PROMPT>%s</PROMPT>
+      </VAR>
+      """;
+
+  /**
+   * Sends the extracted text together with the requested taxonomy to the configured Bedrock model.
+   *
+   * @param extractionRequest request providing the taxonomy items and the converse settings
+   *     (model id, max tokens, temperature, top-p)
+   * @param extractedText text previously extracted from the document; must not be {@code null}
+   * @param bedrockRuntimeClient client used to invoke the Converse API
+   * @return the text of the first content block of the model's reply
+   * @throws NullPointerException if {@code extractedText} is {@code null}
+   * @throws IllegalStateException if the model reply contains no content blocks
+   */
+  public String call(
+      ExtractionRequest extractionRequest,
+      String extractedText,
+      BedrockRuntimeClient bedrockRuntimeClient) {
+    // NOTE(review): this logs the whole request; confirm its toString() does not expose the
+    // authentication data before enabling debug logging in shared environments.
+    LOGGER.debug("Calling AWS Bedrock model with extraction request: {}", extractionRequest);
+
+    // Fail fast instead of letting the formatter render a missing text as the word "null".
+    java.util.Objects.requireNonNull(extractedText, "extractedText");
+
+    String taxonomyItems =
+        extractionRequest.getInput().taxonomyItems().stream()
+            .map(item -> String.format(SYSTEM_PROMPT_VARIABLE_TEMPLATE, item.name(), item.prompt()))
+            .collect(Collectors.joining());
+
+    // Arguments are inserted verbatim and never re-scanned, unlike chained String.replace calls
+    // where the second replacement would also rewrite matches inside the document text.
+    String prompt = SYSTEM_PROMPT_TEMPLATE.formatted(extractedText, taxonomyItems);
+
+    Message message =
+        Message.builder()
+            .content(ContentBlock.fromText(prompt))
+            .role(ConversationRole.USER)
+            .build();
+
+    ConverseData converseData = extractionRequest.getInput().converseData();
+    ConverseResponse response =
+        bedrockRuntimeClient.converse(
+            request ->
+                request
+                    .modelId(converseData.modelId())
+                    .messages(message)
+                    .inferenceConfig(
+                        config ->
+                            config
+                                .maxTokens(converseData.maxTokens())
+                                .temperature(converseData.temperature())
+                                .topP(converseData.topP())));
+
+    var contentBlocks = response.output().message().content();
+    if (contentBlocks.isEmpty()) {
+      throw new IllegalStateException("AWS Bedrock returned a response without any content block");
+    }
+    return contentBlocks.getFirst().text();
+  }
+}
@@ -0,0 +1,108 @@
+/*
+ * Copyright Camunda Services GmbH and/or licensed to Camunda Services GmbH
+ * under one or more contributor license agreements. Licensed under a proprietary license.
+ * See the License.txt file for more information. You may not use this file
+ * except in compliance with the proprietary license.
+ */
+package io.camunda.connector.idp.extraction.caller;
+
+import io.camunda.connector.api.error.ConnectorException;
+import io.camunda.connector.idp.extraction.model.TextractTask;
+import io.camunda.connector.idp.extraction.utils.AwsS3Util;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import software.amazon.awssdk.services.s3.S3AsyncClient;
+import software.amazon.awssdk.services.textract.TextractClient;
+import software.amazon.awssdk.services.textract.model.*;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.stream.Collectors;
+
+import static java.util.concurrent.TimeUnit.SECONDS;
+
+public class PollingTextractCaller {
+ public static final long DELAY_BETWEEN_POLLING = 5;
+
+ public static final int MAX_RESULT = 1000;
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(PollingTextractCaller.class);
+
+ public String call(
+ String documentUrl,
+ String bucketName,
+ TextractClient textractClient,
+ S3AsyncClient s3AsyncClient
+ ) throws Exception {
+
+ S3Object s3Object = AwsS3Util.buildS3ObjectFromUrl(documentUrl, bucketName, s3AsyncClient);
+
+ LOGGER.debug("Starting polling task for document analysis with document: {}", s3Object.name());
+
+ List<FeatureType> featureTypes = new ArrayList<>();
+ featureTypes.add(FeatureType.FORMS);
+ featureTypes.add(FeatureType.TABLES);
+
+ final StartDocumentAnalysisRequest startDocumentAnalysisRequest =
+ StartDocumentAnalysisRequest.builder()
+ .featureTypes(featureTypes)
+ .documentLocation(AwsS3Util.buildDocumentLocation(s3Object)).build();
+
+ final StartDocumentAnalysisResponse response = textractClient.startDocumentAnalysis(startDocumentAnalysisRequest);
+
+ List<Block> allBlocks;
+ try (ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor()) {
+ final String jobId = response.jobId();
+ final TextractTask firstTextractTask = prepareTextractTask(jobId, textractClient);
+ final GetDocumentAnalysisResponse firstDocumentResult = executeTask(firstTextractTask, 0, executorService);
+
+ allBlocks = new ArrayList<>(firstDocumentResult.blocks());
+ boolean isAnalysisFinished = firstDocumentResult.jobStatus().equals(JobStatus.SUCCEEDED);
+
+ while (!isAnalysisFinished) {
+ final TextractTask nextTextractTask = prepareTextractTask(jobId, textractClient);
+ GetDocumentAnalysisResponse nextDocumentResult =
+ executeTask(nextTextractTask, DELAY_BETWEEN_POLLING, executorService);
+ JobStatus newJobStatus = nextDocumentResult.jobStatus();
+
+ switch (newJobStatus) {
+ case SUCCEEDED -> {
+ isAnalysisFinished = true;
+ allBlocks.addAll(nextDocumentResult.blocks());
+ }
+ case FAILED -> throw new ConnectorException(nextDocumentResult.statusMessage());
+ default -> {
+ allBlocks.addAll(nextDocumentResult.blocks());
+ }
+ }
+ }
+ }
+
+ AwsS3Util.deleteS3ObjectFromBucketAsync(s3Object.name(), bucketName, s3AsyncClient);
+
+ return allBlocks.stream()
+ .filter(block -> block.blockType().equals(BlockType.LINE))
+ .map(Block::text)
+ .collect(Collectors.joining("\n"));
+ }
+
+ private TextractTask prepareTextractTask(String jobId, TextractClient textractClient) {
+ GetDocumentAnalysisRequest documentAnalysisRequest = GetDocumentAnalysisRequest
+ .builder()
+ .jobId(jobId)
+ .maxResults(MAX_RESULT)
+ .build();
+
+ return new TextractTask(documentAnalysisRequest, textractClient);
+ }
+
+ private GetDocumentAnalysisResponse executeTask(
+ TextractTask task, long delay, ScheduledExecutorService executorService) throws Exception {
+ ScheduledFuture<GetDocumentAnalysisResponse> nextDocumentResultFuture =
+ executorService.schedule(task, delay, SECONDS);
+ return nextDocumentResultFuture.get();
+ }
+}