# datmin_assignment.py

# -*- coding: utf-8 -*-
"""datmin_assignment.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1MUV5BgkdpLphCNSL_YtV2oc1ACQ3NbZR
"""

import tensorflow as tf
# Bare expression: a notebook cell displays the installed TensorFlow version;
# when run as a plain script this line has no visible effect.
tf.__version__

# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Install the notebook's dependencies. The leading `!` is an IPython/Colab
# shell escape, so these lines are only valid inside a notebook session.
!pip install torch

!pip install scikit-learn

!pip install pandas

!pip install transformers

!pip install sentencepiece

"""
    Module to fine-tune ALBERT as a multi-class sentiment classifier
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf

# Hyperparameter configuration
EPOCH = 100
BATCH = 32
LEARNING_RATE = 1e-5
MODEL_PATH="models/datamining.h1"

# Read the dataset from CSV
FILE_PATH = 'datasets/data_mining_jaya.csv'
df = pd.read_csv(FILE_PATH)

# Drop rows that lack a text or a label: a NaN text makes the tokenizer
# fail and a NaN label cannot be looked up in the label mapping later on.
df = df.dropna(subset=['text', 'sentiment'])

# Prepare the data: hold out 20% of the rows for evaluation. The fixed seed
# keeps the split reproducible. Texts are cast to str because the tokenizer
# only accepts strings (pandas may parse numeric-only cells as numbers).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].astype(str).values,
    df['sentiment'].values,
    test_size=0.2,
    random_state=42,
)

# Download the pre-trained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Map each sentiment label to an integer class id
label_mapping = {'Very Positive': 0, 'Very Negative': 1, 'Mixed': 2, 'Positive': 3, 'Negative': 4, 'Neutral': 5}

# Download the pre-trained ALBERT model. The classification head must have
# one logit per sentiment class; the library default (2) is too small for
# the class ids produced by label_mapping.
model = TFAlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_mapping),
)

# Preprocess the data
# albert-base-v2 only has 512 position embeddings, so longer sequences must
# be truncated to that length before they reach the model.
MAX_LENGTH = 512

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# Extract NumPy arrays
train_input_ids = train_encodings['input_ids'].numpy()
train_attention_mask = train_encodings['attention_mask'].numpy()

test_input_ids = test_encodings['input_ids'].numpy()
test_attention_mask = test_encodings['attention_mask'].numpy()

# Convert the sentiment labels to their numeric class ids. Every class keeps
# its own id (no collapsing into class 0); an unknown label raises KeyError.
train_labels_numeric = [label_mapping[label] for label in train_labels]
test_labels_numeric = [label_mapping[label] for label in test_labels]


# Build the tf.data.Dataset objects
train_dataset = tf.data.Dataset.from_tensor_slices(((train_input_ids, train_attention_mask), train_labels_numeric))
test_dataset = tf.data.Dataset.from_tensor_slices(((test_input_ids, test_attention_mask), test_labels_numeric))

# Train the model
# pylint: disable=no-member
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE) # type: ignore
# The model outputs raw logits, so the loss has to apply the softmax itself.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # type: ignore
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # type: ignore

# Reshuffle the training examples every epoch; Keras does not shuffle a
# tf.data.Dataset on its own.
shuffled_train_dataset = train_dataset.shuffle(len(train_labels_numeric), seed=42)
model.fit(shuffled_train_dataset.batch(BATCH), epochs=EPOCH) # type: ignore


# Evaluate the model on the held-out split
eval_results = model.evaluate(test_dataset.batch(BATCH)) # type: ignore
print("Test loss:", eval_results[0])
print("Test accuracy:", eval_results[1])

# Predict with the trained model
new_texts = ['The foods are awesome', 'Nice to meet you', 'I really dont like it']
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

new_input_ids = new_encodings['input_ids'].numpy()
new_attention_mask = new_encodings['attention_mask'].numpy()

predictions = model.predict([new_input_ids, new_attention_mask]) # type: ignore
# Take the logits from the TFSequenceClassifierOutput
logits = predictions.logits

# Turn the highest-scoring class id of each text back into its label name.
# The reverse mapping is built once instead of re-scanning label_mapping
# for every prediction.
predicted_labels = tf.argmax(logits, axis=1).numpy()
id_to_label = {class_id: name for name, class_id in label_mapping.items()}
predicted_sentiments = [id_to_label[int(label)] for label in predicted_labels]
print("Predicted sentiments:", predicted_sentiments)

# Make sure the target directory exists so the trained weights are not lost
# to a missing-directory error at the very end of the run.
import os
os.makedirs(os.path.dirname(MODEL_PATH) or '.', exist_ok=True)
model.save_weights(MODEL_PATH) # type: ignore

"""Module providing a train pipelines for sentiment analysis"""

from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf

# Sentiment label -> class id mapping.
# NOTE(review): this presumably has to match the mapping used when the
# loaded weights were trained -- confirm against the training run.
label_mapping = {'Very Positive': 0, 'Very Negative': 1, 'Mixed': 2, 'Positive': 3, 'Negative': 4, 'Neutral': 5}
# Reverse lookup used to turn predicted class ids back into label names
id_to_label = {class_id: name for name, class_id in label_mapping.items()}
# albert-base-v2 only has 512 position embeddings, so inputs must be
# truncated to at most that many tokens.
MAX_LENGTH = 512
NUM_LABELS = len(label_mapping) # One logit per sentiment class
# NOTE(review): the training section saves its weights to
# "models/datamining.h1", not to this path -- confirm which checkpoint is
# meant to be loaded here.
MODEL_PATH = 'models/food-sentiment-reviews-10122024'
BASE_PRETRAINED_MODEL='albert-base-v2'


# Build the tokenizer and the classifier, then restore the fine-tuned weights
tokenizer = AlbertTokenizer.from_pretrained(BASE_PRETRAINED_MODEL)
model = TFAlbertForSequenceClassification.from_pretrained(BASE_PRETRAINED_MODEL, num_labels=NUM_LABELS)
model.load_weights(MODEL_PATH) # type: ignore

new_texts = ['recommend this place', 'I dont like it',]
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

new_input_ids = new_encodings['input_ids'].numpy()
new_attention_mask = new_encodings['attention_mask'].numpy()


# Run the predictions, take the logits from the TFSequenceClassifierOutput
# and map the highest-scoring class id of each text to its label name
predictions = model.predict([new_input_ids, new_attention_mask]) # type: ignore
logits = predictions.logits
predicted_labels = tf.argmax(logits, axis=1).numpy()
predicted_sentiments = [id_to_label[int(label)] for label in predicted_labels]
print(f'Predicted sentiments: {predicted_sentiments}')