-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocess.py
30 lines (26 loc) · 788 Bytes
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# -*- coding: utf-8 -*-
import codecs
# Flatten DATA.csv into one record per sentence in ise_processed.txt.
#
# Input layout, as handled by the loop below:
#   * the first physical line of a record is "<sentiment>,<text>";
#   * a physical line ending in u'á' + newline means the sentence continues
#     on the next line; the marker and the newline are stripped;
#   * any other line ends the sentence.
# Each output record is "<id>---<sentiment>---<sentence>", with ids starting
# at 1.  The set of distinct sentiments seen is printed at the end.
sentence_opened = False     # True while inside a multi-line sentence
sentence = ''               # text accumulated for the current sentence
sentiment = ''              # label taken from the record's first line
sent_id = 0                 # running 1-based record counter
available_sentiments = []   # label of every record written, in order

# NOTE(review): the input is decoded as ASCII, yet the continuation marker
# u'á' is not an ASCII character, so a file that really contains the marker
# would fail to decode.  The encoding probably needs to match the actual
# data file -- confirm against DATA.csv.
with codecs.open('DATA.csv', 'r', encoding='ascii') as file_in, \
        open('ise_processed.txt', 'w') as file_out:
    for row in file_in:
        comma = row.find(',')
        # A new record starts only when no sentence is open and the line has
        # a comma after at least one leading character (the sentiment label).
        if not sentence_opened and comma > 0:
            sentence_opened = True
            sentiment = row[:comma]
            row = row[comma + 1:]
            sent_id += 1

        if row.endswith(u'á\n'):
            # Continuation line: drop the marker and keep accumulating.
            sentence += row.replace(u'á\n', '')
        else:
            # Final line of the sentence.  The row keeps its own trailing
            # newline, which terminates the output record.
            sentence += row
            # Fixed: the original referenced the undefined name `sentid`.
            file_out.write('%d---%s---%s' % (sent_id, sentiment, sentence))
            sentence_opened = False
            sentence = ''
            available_sentiments.append(sentiment)

# Parenthesised form prints the same text on Python 2 and also runs on 3.
print(set(available_sentiments))