-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdatmin_assignment.py
144 lines (104 loc) · 5.2 KB
/
datmin_assignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
"""datmin_assignment.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1MUV5BgkdpLphCNSL_YtV2oc1ACQ3NbZR
"""
import tensorflow as tf
# Report the TensorFlow version in use. A bare `tf.__version__` expression is
# only displayed inside a notebook cell; in a script it has to be printed.
print("TensorFlow version:", tf.__version__)

# Mount Google Drive into the Colab runtime (only available on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

# The notebook installed its dependencies with `!pip install ...` shell magics.
# That syntax is IPython-only and is a SyntaxError in a plain .py file, so the
# same packages are installed through the running interpreter's own pip.
import subprocess
import sys

for _package in ("torch", "scikit-learn", "pandas", "transformers", "sentencepiece"):
    # check=False keeps the notebook's best-effort behaviour: a failed install
    # is reported by pip on stderr but does not abort the script.
    subprocess.run([sys.executable, "-m", "pip", "install", _package], check=False)
"""
Module to train a multi-class sentiment classifier
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf
# Training script: fine-tunes a pre-trained ALBERT model to classify review
# text into the sentiment classes listed in `label_mapping`, evaluates it on a
# held-out split, prints a few sample predictions and saves the weights.

# Hyperparameter configuration
EPOCH = 100
BATCH = 32
LEARNING_RATE = 1e-5
MODEL_PATH="models/datamining.h1"
BASE_PRETRAINED_MODEL = 'albert-base-v2'

# Sentiment label -> class id. The classification head is sized from this
# mapping so that every class id is a valid logit index.
label_mapping = {'Very Positive': 0, 'Very Negative': 1, 'Mixed': 2, 'Positive': 3, 'Negative': 4, 'Neutral': 5}
NUM_LABELS = len(label_mapping)
# Reverse lookup (class id -> sentiment label), built once for decoding predictions.
id_to_label = {class_id: name for name, class_id in label_mapping.items()}

# Read the dataset from CSV (uses the 'text' and 'sentiment' columns).
FILE_PATH = 'datasets/data_mining_jaya.csv'
df = pd.read_csv(FILE_PATH)

# Prepare the data: 80/20 train/test split with a fixed seed.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].values,
    df['sentiment'].values,
    test_size=0.2,
    random_state=42
)

# Download the pre-trained ALBERT tokenizer and model. `num_labels` must be
# passed explicitly: the default head has only 2 outputs, which cannot
# represent the class ids produced by `label_mapping`.
tokenizer = AlbertTokenizer.from_pretrained(BASE_PRETRAINED_MODEL)
model = TFAlbertForSequenceClassification.from_pretrained(BASE_PRETRAINED_MODEL, num_labels=NUM_LABELS)

# Preprocessing. ALBERT (albert-base-v2) has 512 position embeddings, so
# sequences must be truncated to at most 512 tokens; a larger limit lets long
# reviews index past the position-embedding table.
MAX_LENGTH = 512
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# Extract NumPy arrays from the encodings.
train_input_ids = train_encodings['input_ids'].numpy()
train_attention_mask = train_encodings['attention_mask'].numpy()
test_input_ids = test_encodings['input_ids'].numpy()
test_attention_mask = test_encodings['attention_mask'].numpy()

# Convert sentiment labels to numeric class ids. A label missing from
# `label_mapping` raises KeyError here rather than being silently relabelled.
train_labels_numeric = [label_mapping[label] for label in train_labels]
test_labels_numeric = [label_mapping[label] for label in test_labels]

# Build the tf.data.Dataset objects: ((input_ids, attention_mask), label).
train_dataset = tf.data.Dataset.from_tensor_slices(((train_input_ids, train_attention_mask), train_labels_numeric))
test_dataset = tf.data.Dataset.from_tensor_slices(((test_input_ids, test_attention_mask), test_labels_numeric))

# Train the model.
# pylint: disable=no-member
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE) # type: ignore
# The model outputs raw logits, not probabilities, so the loss must be told to
# apply softmax itself (the string alias defaults to from_logits=False).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # type: ignore
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy']) # type: ignore
model.fit(train_dataset.batch(BATCH), epochs=EPOCH) # type: ignore

# Evaluate the model on the held-out split.
eval_results = model.evaluate(test_dataset.batch(BATCH)) # type: ignore
print("Test loss:", eval_results[0])
print("Test accuracy:", eval_results[1])

# Predict with the trained model on a few sample sentences.
new_texts = ['The foods are awesome', 'Nice to meet you', 'I really dont like it']
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
new_input_ids = new_encodings['input_ids'].numpy()
new_attention_mask = new_encodings['attention_mask'].numpy()
predictions = model.predict([new_input_ids, new_attention_mask]) # type: ignore

# Take the logits from the TFSequenceClassifierOutput.
logits = predictions.logits

# Decode the predicted sentiment: highest-scoring class id -> label name.
predicted_labels = tf.argmax(logits, axis=1).numpy()
predicted_sentiments = [id_to_label[int(label)] for label in predicted_labels]
print("Predicted sentiments:", predicted_sentiments)

# Persist the fine-tuned weights.
model.save_weights(MODEL_PATH) # type: ignore
"""Module providing an inference pipeline for sentiment analysis"""
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf
# Inference script: rebuilds the ALBERT classifier, loads fine-tuned weights
# from MODEL_PATH and prints the predicted sentiment for a few sample texts.

# Sentiment label -> class id used by the fine-tuned classification head.
label_mapping = {'Very Positive': 0, 'Very Negative': 1, 'Mixed': 2, 'Positive': 3, 'Negative': 4, 'Neutral': 5}
# Reverse lookup (class id -> sentiment label), built once for decoding predictions.
id_to_label = {class_id: name for name, class_id in label_mapping.items()}

# ALBERT (albert-base-v2) has 512 position embeddings; a larger truncation
# limit would let long inputs index past the position-embedding table.
MAX_LENGTH = 512
# Derived from the mapping so the head size cannot drift from the label set.
NUM_LABELS = len(label_mapping)
MODEL_PATH = 'models/food-sentiment-reviews-10122024'
BASE_PRETRAINED_MODEL='albert-base-v2'

# Download the pre-trained ALBERT tokenizer and model, then overwrite the
# model's weights with the fine-tuned ones.
tokenizer = AlbertTokenizer.from_pretrained(BASE_PRETRAINED_MODEL)
model = TFAlbertForSequenceClassification.from_pretrained(BASE_PRETRAINED_MODEL, num_labels=NUM_LABELS)
model.load_weights(MODEL_PATH) # type: ignore

# Tokenize the sample texts.
new_texts = ['recommend this place', 'I dont like it',]
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
new_input_ids = new_encodings['input_ids'].numpy()
new_attention_mask = new_encodings['attention_mask'].numpy()

# Run the predictions and take the logits from the TFSequenceClassifierOutput.
predictions = model.predict([new_input_ids, new_attention_mask]) # type: ignore
logits = predictions.logits

# Highest-scoring class id per text -> label name.
predicted_labels = tf.argmax(logits, axis=1).numpy()
predicted_sentiments = [id_to_label[int(label)] for label in predicted_labels]
print(f'Predicted sentiments: {predicted_sentiments}')