From e7e0b551c297c70ce21ab2a4dc5281909b8829de Mon Sep 17 00:00:00 2001
From: Shivansh Shalabh <65492037+ShivanshShalabh@users.noreply.github.com>
Date: Fri, 28 Jun 2024 22:30:13 -0400
Subject: [PATCH] Completed imageToText (#21)
---
.../QuickInput/QuickInput.stories.js | 14 ++++-
.../Outputs/ImageToText/ImageToText.scss | 7 +++
.../Outputs/ImageToText/ImageToTextOutput.js | 29 ++++++++++
.../ImageToText/ImageToTextOutput.stories.js | 13 +++++
.../ImageToText/ImageToTextOutput.test.js | 0
.../ImageToTextOutputInputSection.js | 29 ++++++++++
.../testData/testImageToTextOutput.js | 29 ++++++++++
.../QuickOutput/Outputs/Text/TextOutput.scss | 4 +-
src/helpers/DefaultModels.js | 53 ++++++++++++++++++-
src/helpers/Task.js | 36 +++++++++++--
src/helpers/TaskIDs.js | 1 +
src/helpers/UppyFileTypeCheckerPlugin.js | 4 +-
src/helpers/sampleImages.js | 15 ++++++
src/resources/icons/icon-imageToText.svg | 1 +
14 files changed, 225 insertions(+), 10 deletions(-)
create mode 100644 src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToText.scss
create mode 100644 src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.js
create mode 100644 src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.stories.js
create mode 100644 src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.test.js
create mode 100644 src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutputInputSection.js
create mode 100644 src/components/Experiment/QuickOutput/Outputs/ImageToText/testData/testImageToTextOutput.js
create mode 100644 src/resources/icons/icon-imageToText.svg
diff --git a/src/components/Experiment/QuickInput/QuickInput.stories.js b/src/components/Experiment/QuickInput/QuickInput.stories.js
index 7b3875c8..d4df3478 100644
--- a/src/components/Experiment/QuickInput/QuickInput.stories.js
+++ b/src/components/Experiment/QuickInput/QuickInput.stories.js
@@ -17,6 +17,7 @@ import {
textToImage,
textToVideo,
imageTo3D,
+ imageToText,
} from "../../../helpers/TaskIDs";
import {
SampleImageClassificationInputs,
@@ -29,7 +30,8 @@ import {
SampleTextGuidedImageToImageInputs,
SampleDocumentQuestionAnsweringInputs,
SampleTextToImage,
- SampleTextToVideo
+ SampleTextToVideo,
+ SampleImageToText
} from "../../../helpers/sampleImages";
export default {
@@ -226,3 +228,13 @@ TextToVideo.args = {
},
},
};
+
+export const ImageToText = Template.bind({}); // QuickInput story for the image-to-text task
+ImageToText.args = {
+ sampleInputs: SampleImageToText, // sample image list imported from helpers/sampleImages
+ model: {
+ output: {
+ type: imageToText, // task id imported from helpers/TaskIDs
+ },
+ },
+};
diff --git a/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToText.scss b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToText.scss
new file mode 100644
index 00000000..63dc6721
--- /dev/null
+++ b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToText.scss
@@ -0,0 +1,7 @@
+.image-to-text-output { // same block name the image-to-text components pass to useBEMNaming
+ &__input-image-content {
+ img {
+ margin-top: 12px; // gap above the image inside the input-image-content element
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.js b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.js
new file mode 100644
index 00000000..437e4bc7
--- /dev/null
+++ b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.js
@@ -0,0 +1,29 @@
+import React from "react";
+import useBEMNaming from "../../../../../common/useBEMNaming";
+import useTextOutput from "../Text/useTextOutput";
+import ImageToTextOutputInputSection from "./ImageToTextOutputInputSection";
+import { TextOutputBox } from "../Text/TextOutputBox";
+import { imageToText } from "../../../../../helpers/TaskIDs";
+
+export default function ImageToTextOutput(props) { // Output view for an image-to-text trial; reads props.trial and props.onSubmit
+ const { getBlock } = useBEMNaming("image-to-text-output");
+ const { output, inferenceDuration, input, setInput } = useTextOutput( // shared text-output hook, fed with the trial
+ props.trial
+ );
+
+ const onSubmit = () => { // hands the current input to the parent's onSubmit callback
+ props.onSubmit(input);
+ };
+
+ return ( // NOTE(review): the JSX returned here is missing from this patch text - confirm against the repository
+
+
+
+
+
+ );
+}
diff --git a/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.stories.js b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.stories.js
new file mode 100644
index 00000000..bd2c38dc
--- /dev/null
+++ b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.stories.js
@@ -0,0 +1,13 @@
+import React from "react";
+import ImageToTextOutput from "./ImageToTextOutput";
+import { TestImageToTextOutput } from "./testData/testImageToTextOutput";
+
+export default { // Storybook metadata for the ImageToTextOutput component
+ title: "Experiments/Quick Output/Image To Text",
+ component: ImageToTextOutput,
+};
+
+const template = (args) => ; // NOTE(review): the JSX this template returns is missing from this patch text
+
+export const Default = template.bind({});
+Default.args = { trial: TestImageToTextOutput }; // uses the canned trial fixture from ./testData
diff --git a/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.test.js b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutput.test.js
new file mode 100644
index 00000000..e69de29b
diff --git a/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutputInputSection.js b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutputInputSection.js
new file mode 100644
index 00000000..3f351f8c
--- /dev/null
+++ b/src/components/Experiment/QuickOutput/Outputs/ImageToText/ImageToTextOutputInputSection.js
@@ -0,0 +1,29 @@
+import React from "react";
+import useBEMNaming from "../../../../../common/useBEMNaming";
+import "./ImageToText.scss"
+export default function ImageToTextOutputInputSection(props) { // Input panel of the image-to-text output; the markup is missing from this patch text
+ const { getElement } = useBEMNaming("image-to-text-output"); // element class names under the image-to-text-output block
+ const input = props.input; // presumably an object with src/alt like the sample inputs - TODO confirm against caller
+
+ return (
+
+
+ Input Image
+
+
+
+
+ The uploaded image file:
+
+
+
+
+
+
+ );
+}
diff --git a/src/components/Experiment/QuickOutput/Outputs/ImageToText/testData/testImageToTextOutput.js b/src/components/Experiment/QuickOutput/Outputs/ImageToText/testData/testImageToTextOutput.js
new file mode 100644
index 00000000..6fcaa410
--- /dev/null
+++ b/src/components/Experiment/QuickOutput/Outputs/ImageToText/testData/testImageToTextOutput.js
@@ -0,0 +1,29 @@
+export const TestImageToTextOutputGeneratedToken = { // fixture holding only an id (same value as the trial id below)
+ id: "sampletestimagetotextoutputidhere"
+};
+
+export const TestImageToTextOutput = { // canned completed trial: one input image and one text response
+ id: "sampletestimagetotextoutputidhere",
+ inputs: [
+ {
+ src: "https://s3.amazonaws.com/uploads.staging.mlmodelscope.org/crabby.png",
+ alt: "crab"
+ },
+ ],
+ completed_at: "2023-06-03T18:17:14.513854Z",
+ results: {
+ 'duration': "9.216154124s",
+ 'duration_for_inference': "9.193807904s",
+ 'responses': [
+ {
+ 'features': [ // a single feature of type TEXT carrying the caption string
+ {
+ 'text': 'A crab on the beach',
+ 'type': 'TEXT'
+ }
+ ],
+ 'id': "sampletestimagetotextoutputresponseidhere"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/src/components/Experiment/QuickOutput/Outputs/Text/TextOutput.scss b/src/components/Experiment/QuickOutput/Outputs/Text/TextOutput.scss
index 0562b3b6..3050a1fd 100644
--- a/src/components/Experiment/QuickOutput/Outputs/Text/TextOutput.scss
+++ b/src/components/Experiment/QuickOutput/Outputs/Text/TextOutput.scss
@@ -2,7 +2,7 @@
.text-output, .text-to-code-output,
.audio-to-text-output, .text-to-audio-output,
-.text-conversation-output {
+.text-conversation-output,.image-to-text-output {
display: flex;
flex-direction: row;
gap: 72px;
@@ -42,7 +42,7 @@
}
- &-audio-content {
+ &-audio-content, &-image-content {
padding-top: 12px;
font-size: 20px;
}
diff --git a/src/helpers/DefaultModels.js b/src/helpers/DefaultModels.js
index 20aec52e..25baaf5a 100644
--- a/src/helpers/DefaultModels.js
+++ b/src/helpers/DefaultModels.js
@@ -1,4 +1,4 @@
-import { audioToText, documentQuestionAnswering, styleTransfer, textGuidedImageToImage, textToAudio, textToImage, textToText, textToVideo, visualQuestionAnswering } from "./TaskIDs";
+import { audioToText, documentQuestionAnswering, styleTransfer, textGuidedImageToImage, textToAudio, textToImage, textToText, textToVideo, visualQuestionAnswering,imageToText } from "./TaskIDs";
export const DefaultImageClassificationModel = {
id: 1,
@@ -801,4 +801,55 @@ export const DefaultTextToVideo = {
link2: "",
},
version: "1.0",
+};
+export const DefaultImageToText = {
+ id: 190,
+ created_at: "2022-04-29T20:48:47.370171Z",
+ updated_at: "2022-04-29T20:48:47.370171Z",
+ attributes: {
+ Top1: "",
+ Top5: "",
+ kind: "CNN",
+ manifest_author: "Jingning Tang",
+ training_dataset: "PASCAL VOC 2012",
+ },
+ description:
+ "TensorFlow Chatbot model, which is trained on the COCO (Common Objects in Context) dataset. Use deeplabv3_mnv2_dm05_pascal_train_aug(deeplabv3_mnv2_dm05_pascal_train_aug_2018_10_01) from TensorFlow DeepLab Model Zoo.\n",
+ short_description:
+ "DeepLabv3 is a deep convolutional neural networks for semantic chatbotness. It employ atrous convolution in cascade or in parallel to capture multi-scale context by adopting multiple atrous rates.",
+ model: {
+ graph_checksum: "0336ceb67b378df8ada0efe9eadb5ac8",
+ graph_path:
+ "https://s3.amazonaws.com/store.carml.org/models/tensorflow/models/deeplabv3_mnv2_dm05_pascal_train_aug_2018_10_01/frozen_inference_graph.pb",
+ weights_checksum: "",
+ weights_path: "",
+ },
+ framework: {
+ id: 4,
+ name: "TensorFlow",
+ version: "1.14.0",
+ architectures: [
+ {
+ name: "amd64",
+ },
+ ],
+ },
+ input: {
+ description: "text to be responded to",
+ type: "text",
+ },
+ license: "Apache License, Version 2.0",
+ name: "DeepLabv3_MobileNet_v2_DM_05_PASCAL_VOC_Train_Aug",
+ output: {
+ description: "the chatbot's response to the inputted text",
+ type: imageToText,
+ },
+ url: {
+ github:
+ "https://github.com/rai-project/tensorflow/blob/master/builtin_models/DeepLabv3_MobileNet_v2_DM_05_PASCAL_VOC_Train_Aug.yml",
+ citation: "https://arxiv.org/pdf/1802.02611v3.pdf",
+ link1: "https://arxiv.org/pdf/1706.05587.pdf",
+ link2: "",
+ },
+ version: "1.0",
};
\ No newline at end of file
diff --git a/src/helpers/Task.js b/src/helpers/Task.js
index a604ec5b..0d343808 100644
--- a/src/helpers/Task.js
+++ b/src/helpers/Task.js
@@ -15,7 +15,8 @@ import {
textGuidedImageToImage,
documentQuestionAnswering,
textToImage,
- textToVideo
+ textToVideo,
+ imageToText
} from "./TaskIDs";
import React from "react";
import { ReactComponent as ImageClassification } from "../resources/icons/icon-imageClassification.svg";
@@ -34,6 +35,8 @@ import { ReactComponent as VisualQuestionAnswering } from "../resources/icons/ic
import { ReactComponent as TextGuidedImageToImage } from "../resources/icons/icon-textGuidedImagetoImage.svg";
import { ReactComponent as TexttoImage } from "../resources/icons/icon-textToImage.svg";
import { ReactComponent as TexttoVideo } from "../resources/icons/icon-textToVideo.svg";
+import { ReactComponent as ImageToText } from "../resources/icons/icon-imageToText.svg";
+
import {
DefaultImageClassificationModel,
@@ -51,7 +54,8 @@ import {
DefaultDocumentQuestionAnsweringModel,
DefaultTextToImage,
DefaultTextToVideo,
- DefaultImageTo3DModel
+ DefaultImageTo3DModel,
+ DefaultImageToText
} from "./DefaultModels";
import {
SampleAudioToTextInputs,
@@ -64,7 +68,8 @@ import {
SampleVisualQuestionAnsweringInputs,
SampleDocumentQuestionAnsweringInputs,
SampleTextToImage,
- SampleTextToVideo
+ SampleTextToVideo,
+ SampleImageToText
} from "./sampleImages";
import { TestImageClassificationResult } from "../components/Experiment/QuickOutput/Outputs/Classification/Features";
@@ -85,6 +90,7 @@ import { TestTextGuidedImageToImage } from "../components/Experiment/QuickOutput
import { TestTextToImageOutput } from "../components/Experiment/QuickOutput/Outputs/TextToImage/testData/testTextToImageOutput";
import { TestTextToVideoOutput } from "../components/Experiment/QuickOutput/Outputs/TextToVideo/testData/testTextToVideoOutput";
import { TestImageTo3DOutput } from "../components/Experiment/QuickOutput/Outputs/ImageTo3D/testData/testImageTo3DOutput";
+import { TestImageToTextOutput } from "../components/Experiment/QuickOutput/Outputs/ImageToText/testData/testImageToTextOutput";
export default class Task {
@@ -213,7 +219,7 @@ export default class Task {
inputType: TaskInputTypes.Image,
},
],
-
+
outputText: "3D model generated from the uploaded images",
icon: (props) => ,
sampleInputs: [],
@@ -417,6 +423,19 @@ export default class Task {
inputType: TaskInputTypes.Text,
});
+ static image_to_text = new Task({ // static Task instance describing the image-to-text (captioning) task
+ name: "Image to Text",
+ description: "Caption an image.",
+ id: imageToText,
+ inputText: "Generate a caption for the image.",
+ outputText: "Caption:",
+ icon: (props) => , // NOTE(review): the icon JSX is missing from this patch text
+ sampleInputs: SampleImageToText,
+ tutorialDescription: "Image to Text model generates a caption for an image.",
+ inputType: TaskInputTypes.Image,
+ });
+
+
constructor(options) {
this.name = options.name ?? "";
this.id = options.id ?? this.name;
@@ -479,6 +498,8 @@ export default class Task {
return Task.text_to_image;
case textToVideo:
return Task.text_to_video;
+ case imageToText:
+ return Task.image_to_text;
default:
return new Task({ name: "unknown", description: "unknown task name" });
}
@@ -521,6 +542,8 @@ export default class Task {
return DefaultTextToImage;
case textToVideo:
return DefaultTextToVideo;
+ case imageToText:
+ return DefaultImageToText;
default:
return undefined;
}
@@ -562,6 +585,8 @@ export default class Task {
return TestTextToImageOutput;
case textToVideo:
return TestTextToVideoOutput;
+ case imageToText:
+ return TestImageToTextOutput;
default:
return undefined;
}
@@ -586,7 +611,8 @@ export default class Task {
this.getStaticTask(textGuidedImageToImage),
this.getStaticTask(documentQuestionAnswering),
this.getStaticTask(textToImage),
- this.getStaticTask(textToVideo)
+ this.getStaticTask(textToVideo),
+ this.getStaticTask(imageToText)
];
}
diff --git a/src/helpers/TaskIDs.js b/src/helpers/TaskIDs.js
index 467a225c..8d815241 100644
--- a/src/helpers/TaskIDs.js
+++ b/src/helpers/TaskIDs.js
@@ -15,4 +15,5 @@ export const textGuidedImageToImage = "text_guided_image_to_image";
export const documentQuestionAnswering = "document_question_answering";
export const textToImage = "text_to_image"
export const textToVideo = "text_to_video"
+export const imageToText = "image_to_text"
export const pending = "pending";
diff --git a/src/helpers/UppyFileTypeCheckerPlugin.js b/src/helpers/UppyFileTypeCheckerPlugin.js
index a602a181..1f59b334 100644
--- a/src/helpers/UppyFileTypeCheckerPlugin.js
+++ b/src/helpers/UppyFileTypeCheckerPlugin.js
@@ -11,7 +11,8 @@ import {
styleTransfer,
imageTo3D,
textGuidedImageToImage,
- visualQuestionAnswering
+ visualQuestionAnswering,
+ imageToText
} from './TaskIDs';
import fileTypeChecker from "file-type-checker";
@@ -31,6 +32,7 @@ export const getAllowedFileTypes = (task) => {
case styleTransfer:
case imageTo3D:
case instance_segmentation:
+ case imageToText:
return {
fileTypes: ['bmp', 'gif', 'ico', 'jpeg', 'pdf', 'png', 'psd'],
mimeTypes: ['image/*']
diff --git a/src/helpers/sampleImages.js b/src/helpers/sampleImages.js
index d1f83d10..311e110f 100644
--- a/src/helpers/sampleImages.js
+++ b/src/helpers/sampleImages.js
@@ -215,4 +215,19 @@ export const SampleTextToVideo = [
"Cat and dog playing",
"Flower in a garden",
"Sunset on a beach"
+];
+
+export const SampleImageToText = [ // sample inputs for the image-to-text task: image URL (src) plus alt text
+ {
+ src: "https://s3.amazonaws.com/uploads.staging.mlmodelscope.org/birdy.png",
+ alt: "bird"
+ },
+ {
+ src: "https://s3.amazonaws.com/uploads.staging.mlmodelscope.org/kitty.png",
+ alt: "cat"
+ },
+ {
+ src: "https://s3.amazonaws.com/uploads.staging.mlmodelscope.org/crabby.png",
+ alt: "crab"
+ }
];
\ No newline at end of file
diff --git a/src/resources/icons/icon-imageToText.svg b/src/resources/icons/icon-imageToText.svg
new file mode 100644
index 00000000..0d1cf45e
--- /dev/null
+++ b/src/resources/icons/icon-imageToText.svg
@@ -0,0 +1 @@
+
\ No newline at end of file