# CNN_NLP.py
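"""Pairwise document relatedness with a two-branch 1D CNN.

Reads a set of article text files, assigns each a crude keyword label,
builds every document pair, and trains a shared-embedding CNN to predict
whether the two documents in a pair carry the same label.
"""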
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, concatenate
from tensorflow.keras.models import Model
# Paths to the article text files used as the training corpus
output_txt_paths = [
    'articles_text/1intro.txt',
    'articles_text/2relation.txt',
    'articles_text/3count.txt',
    'articles_text/4probability.txt',
    'articles_text/libya.txt',
    'articles_text/Homework_3_Fisayo_Ojo.txt',
    'articles_text/An Investigation of the Pattern And Environmental Impact of Oil.txt',
    'articles_text/Analysis of oil spill impacts along pipelines..txt',
    'articles_text/Causes and Terrain of Oil Spillage in Niger Delta.txt',
    'articles_text/Deficient legislation sanctioning oil spill.txt',
    'articles_text/Effects of Oil Spillage (Pollution) on Agricultural Production in Delta.txt',
    'articles_text/Effects of oil spills on fish production in the Niger Delta.txt',
    'articles_text/EFFECTS OF OIL SPILLAGE ON FISH IN NIGERIA.txt',
    'articles_text/Environmental Consequences of Oil Spills on Marine Habitats and the Mitigating Measures—The Niger Delta Perspective.txt',
    'articles_text/ENVIRONMENTAL IMPACTS OF OIL EXPLORATION.txt',
    'articles_text/Evaluation of the Impacts of Oil Spill Disaster on Communities in Niger Delta, Nigeria.txt',
    'articles_text/Impacts and Management of Oil Spill Pollution along the Nigerian Coastal Areas.txt',
    'articles_text/Impacts of Oil Exploration (Oil and Gas Conflicts; Niger Delta as a Case Study).txt',
    'articles_text/Impacts of Oil Production on Nigeria‘s Waters.txt',
    'articles_text/NIGERIA OIL POLLUTION, POLITICS AND POLICY.txt',
    'articles_text/Oil Pollution in Nigeria and the Issue of Human Rights of the Victims.txt',
    'articles_text/Oil Spills and Human Health.txt',
    'articles_text/OIL SPILLS IN THE NIGER DELTA.txt',
    'articles_text/Press Coverage of Environmental Pollution In The Niger Delta Region of Nigeria.txt',
    'articles_text/Shell will sell big piece of its Nigeria oil business, but activists want pollution cleaned up first _ AP News.txt',
]
# Read each article, skipping any file that is missing on disk
documents = []
for path in output_txt_paths:
    try:
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
        documents.append(text)
    except FileNotFoundError:
        print(f"File {path} not found. Skipping.")

# Crude keyword heuristic: label a document 1 if it mentions "oil", else 0
labels = [1 if "oil" in doc.lower() else 0 for doc in documents]
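# Fit the tokenizer on the corpus, convert each document to an integer
# sequence, and pad every sequence to the length of the longest document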
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(documents)
sequences = tokenizer.texts_to_sequences(documents)
max_length = max(len(x) for x in sequences)
padded_docs = pad_sequences(sequences, maxlen=max_length, padding='post')
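# Build every unordered pair of documents; a pair is labelled 1 when both
# documents carry the same keyword label, 0 otherwise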
pairs = []
pair_labels = []
for i in range(len(padded_docs) - 1):
    for j in range(i + 1, len(padded_docs)):
        pairs.append([padded_docs[i], padded_docs[j]])
        pair_labels.append(int(labels[i] == labels[j]))
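# Convert the pair lists to NumPy arrays for Keras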
pairs = np.array(pairs)
pair_labels = np.array(pair_labels)
# Model architecture
def create_cnn_model():
    # Two inputs: one padded sequence per document in the pair
    input_1 = Input(shape=(max_length,))
    input_2 = Input(shape=(max_length,))
    # Shared embedding layer so both branches use the same word vectors
    embedding = Embedding(5000, 50)
    embedded_1 = embedding(input_1)
    embedded_2 = embedding(input_2)
    conv1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedded_1)
    conv2 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedded_2)
    pool1 = MaxPooling1D(pool_size=5)(conv1)
    pool2 = MaxPooling1D(pool_size=5)(conv2)
    flat1 = Flatten()(pool1)
    flat2 = Flatten()(pool2)
    # Merge layers
    merged = concatenate([flat1, flat2])
    # Fully connected layers
    dense1 = Dense(10, activation='relu')(merged)
    output = Dense(1, activation='sigmoid')(dense1)
    # Compile model
    model = Model(inputs=[input_1, input_2], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
model = create_cnn_model()
model.summary()
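# Hold out 20% of the document pairs for evaluation, then split each pair
# into the model's two inputs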
X_train, X_test, y_train, y_test = train_test_split(pairs, pair_labels, test_size=0.2, random_state=42)
X_train_1 = np.array([x[0] for x in X_train])
X_train_2 = np.array([x[1] for x in X_train])
X_test_1 = np.array([x[0] for x in X_test])
X_test_2 = np.array([x[1] for x in X_test])
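# Re-instantiate the model so training starts from freshly initialised weights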
model = create_cnn_model()
history = model.fit([X_train_1, X_train_2], y_train,
                    validation_data=([X_test_1, X_test_2], y_test),
                    epochs=10,
                    batch_size=32)
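# Save the trained model and report accuracy on the held-out pairs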
model.save('model.h5')
test_loss, test_accuracy = model.evaluate([X_test_1, X_test_2], y_test)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")
new_documents = [
    'articles_text/espionage.txt',
    'articles_text/mining.txt',
]
new_docs = []
for path in new_documents:
    try:
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
        new_docs.append(text)
    except FileNotFoundError:
        print(f"File {path} not found. Skipping.")
new_sequences = tokenizer.texts_to_sequences(new_docs)
new_padded_docs = pad_sequences(new_sequences, maxlen=max_length, padding='post')
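# Stack the two padded documents into a single pair and predict their relatedness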
new_pair = np.array([new_padded_docs[0], new_padded_docs[1]]).reshape(1, 2, -1)
new_prediction = model.predict([new_pair[:, 0], new_pair[:, 1]])
related = new_prediction[0][0] > 0.5
print(f"The documents are {'related' if related else 'not related'} with a confidence of {new_prediction[0][0]:.2f}")