diff --git a/core/amber/requirements.txt b/core/amber/requirements.txt
index a3b201b6794..05094ee5be2 100644
--- a/core/amber/requirements.txt
+++ b/core/amber/requirements.txt
@@ -24,4 +24,5 @@ python-lsp-server[all]==1.5.0
 python-lsp-server[websockets]
 bidict==0.22.0
 cached_property
-psutil
\ No newline at end of file
+psutil
+transformers
\ No newline at end of file
diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/operators/LogicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/operators/LogicalOp.scala
index 016cb9a23f6..15be56f76db 100644
--- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/operators/LogicalOp.scala
+++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/operators/LogicalOp.scala
@@ -34,6 +34,7 @@ import edu.uci.ics.texera.workflow.operators.intersect.IntersectOpDesc
 import edu.uci.ics.texera.workflow.operators.intervalJoin.IntervalJoinOpDesc
 import edu.uci.ics.texera.workflow.operators.keywordSearch.KeywordSearchOpDesc
 import edu.uci.ics.texera.workflow.operators.limit.LimitOpDesc
+import edu.uci.ics.texera.workflow.operators.huggingFace.HuggingFaceSentimentAnalysisOpDesc
 import edu.uci.ics.texera.workflow.operators.projection.ProjectionOpDesc
 import edu.uci.ics.texera.workflow.operators.randomksampling.RandomKSamplingOpDesc
 import edu.uci.ics.texera.workflow.operators.regex.RegexOpDesc
@@ -180,7 +181,11 @@ trait StateTransferFunc
     new Type(value = classOf[FunnelPlotOpDesc], name = "FunnelPlot"),
     new Type(value = classOf[TablesPlotOpDesc], name = "TablesPlot"),
     new Type(value = classOf[JavaUDFOpDesc], name = "JavaUDF"),
-    new Type(value = classOf[SortOpDesc], name = "Sort")
+    new Type(value = classOf[SortOpDesc], name = "Sort"),
+    new Type(
+      value = classOf[HuggingFaceSentimentAnalysisOpDesc],
+      name = "HuggingFaceSentimentAnalysis"
+    )
   )
 )
 abstract class LogicalOp extends PortDescriptor with Serializable {
diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala
new file mode 100644
index 00000000000..ac9a109df9d
--- /dev/null
+++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala
@@ -0,0 +1,119 @@
+package edu.uci.ics.texera.workflow.operators.huggingFace
+
+import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
+import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort}
+import edu.uci.ics.texera.workflow.common.metadata.annotations.AutofillAttributeName
+import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo}
+import edu.uci.ics.texera.workflow.common.operators.PythonOperatorDescriptor
+import edu.uci.ics.texera.workflow.common.tuple.schema.{AttributeType, Schema}
+
+/**
+  * Descriptor of a Python operator that scores the sentiment of a text attribute with the
+  * Hugging Face model `cardiffnlp/twitter-roberta-base-sentiment-latest`.
+  *
+  * Each input tuple is emitted once, extended with three DOUBLE attributes holding the
+  * softmax score of the positive, neutral and negative class.
+  */
+class HuggingFaceSentimentAnalysisOpDesc extends PythonOperatorDescriptor {
+  @JsonProperty(value = "attribute", required = true)
+  @JsonPropertyDescription("column to perform sentiment analysis on")
+  @AutofillAttributeName
+  var attribute: String = _
+
+  @JsonProperty(
+    value = "Positive result attribute",
+    required = true,
+    defaultValue = "huggingface_sentiment_positive"
+  )
+  @JsonPropertyDescription("column name of the sentiment analysis result (positive)")
+  var resultAttributePositive: String = _
+
+  @JsonProperty(
+    value = "Neutral result attribute",
+    required = true,
+    defaultValue = "huggingface_sentiment_neutral"
+  )
+  @JsonPropertyDescription("column name of the sentiment analysis result (neutral)")
+  var resultAttributeNeutral: String = _
+
+  @JsonProperty(
+    value = "Negative result attribute",
+    required = true,
+    defaultValue = "huggingface_sentiment_negative"
+  )
+  @JsonPropertyDescription("column name of the sentiment analysis result (negative)")
+  var resultAttributeNegative: String = _
+
+  /**
+    * Generates the Python source of the operator.
+    *
+    * The tokenizer, config and model are loaded once in `open`; `process_tuple` tokenizes
+    * the configured attribute, runs the model without gradient tracking and writes the
+    * rounded per-class softmax scores into the three configured result attributes.
+    *
+    * @return the Python source code of the operator
+    */
+  override def generatePythonCode(): String = {
+    s"""from pytexera import *
+       |import torch
+       |from scipy.special import softmax
+       |from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
+       |
+       |class ProcessTupleOperator(UDFOperatorV2):
+       |
+       |    def open(self):
+       |        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+       |        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+       |        self.config = AutoConfig.from_pretrained(model_name)
+       |        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
+       |
+       |    @overrides
+       |    def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]:
+       |        # Truncate: text longer than the model's 512-token limit would crash inference.
+       |        encoded_input = self.tokenizer(
+       |            tuple_["$attribute"], return_tensors='pt', truncation=True, max_length=512
+       |        )
+       |        # Inference only, so skip building the autograd graph.
+       |        with torch.no_grad():
+       |            output = self.model(**encoded_input)
+       |        scores = softmax(output[0][0].numpy())
+       |        score_by_label = {self.config.id2label[i]: float(s) for i, s in enumerate(scores)}
+       |        tuple_["$resultAttributePositive"] = round(score_by_label["positive"], 4)
+       |        tuple_["$resultAttributeNeutral"] = round(score_by_label["neutral"], 4)
+       |        tuple_["$resultAttributeNegative"] = round(score_by_label["negative"], 4)
+       |        yield tuple_""".stripMargin
+  }
+
+  /** Operator metadata: name, description, group and a single input and output port. */
+  override def operatorInfo: OperatorInfo =
+    OperatorInfo(
+      "Hugging Face Sentiment Analysis",
+      "Analyzing Sentiments with a Twitter-Based Model from Hugging Face",
+      OperatorGroupConstants.MACHINE_LEARNING_GROUP,
+      inputPorts = List(InputPort()),
+      outputPorts = List(OutputPort()),
+      supportReconfiguration = true
+    )
+
+  /**
+    * Appends the three DOUBLE result attributes to the schema of the first input.
+    *
+    * @param schemas input schemas; only `schemas(0)` is used
+    * @return the output schema, or `null` while any result attribute name is null or blank
+    */
+  override def getOutputSchema(schemas: Array[Schema]): Schema = {
+    if (
+      resultAttributePositive == null || resultAttributePositive.trim.isEmpty ||
+      resultAttributeNeutral == null || resultAttributeNeutral.trim.isEmpty ||
+      resultAttributeNegative == null || resultAttributeNegative.trim.isEmpty
+    )
+      return null
+    Schema
+      .builder()
+      .add(schemas(0))
+      .add(resultAttributePositive, AttributeType.DOUBLE)
+      .add(resultAttributeNeutral, AttributeType.DOUBLE)
+      .add(resultAttributeNegative, AttributeType.DOUBLE)
+      .build()
+  }
+}
diff --git a/core/gui/src/assets/operator_images/HuggingFaceSentimentAnalysis.png b/core/gui/src/assets/operator_images/HuggingFaceSentimentAnalysis.png
new file mode 100644
index 00000000000..673b8ea9077
Binary files /dev/null and b/core/gui/src/assets/operator_images/HuggingFaceSentimentAnalysis.png differ