-
Notifications
You must be signed in to change notification settings - Fork 2
/
Cleaning_data.py
36 lines (26 loc) · 1.13 KB
/
Cleaning_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import re
from os import listdir
from os.path import isfile, join
from pyarabic import araby
def clean(text):
    """Blank out everything that is not a basic Arabic letter or a space.

    Each ``<br/>`` tag becomes a single space. After that, every character
    outside the range U+0621..U+064A (other than the ASCII space) is
    replaced, one for one, by a space. Runs of spaces are not collapsed
    and the result is not stripped.
    """
    # Handle the tag first so it yields one separator instead of one space
    # per character of the tag.
    without_breaks = " ".join(text.split("<br/>"))
    non_arabic = re.compile(u'[^\u0621-\u064a ]')
    return non_arabic.sub(" ", without_breaks)
# Ordered (pattern, replacement) rules applied by process().
# The order matters: repeated letters are collapsed BEFORE the letter
# variants are folded, so that e.g. a doubled teh marbuta is reduced to one
# before it is turned into heh (and therefore never feeds the heh rule).
_PROCESS_RULES = (
    ('ـ+', ' '),      # run of tatweel (U+0640) -> one space
    ('ر+', 'ر'),      # run of reh -> single reh
    ('اا+', 'ا'),     # 2 or more alef -> single alef
    ('ووو+', 'و'),    # 3 or more waw -> single waw
    ('ههه+', 'ههه'),  # 3 or more heh -> exactly three heh
    ('ةة+', 'ة'),     # 2 or more teh marbuta -> single teh marbuta
    ('ييي+', 'ي'),    # 3 or more yeh -> single yeh
    ('أ', 'ا'),       # alef with hamza above -> bare alef
    ('آ', 'ا'),       # alef with madda -> bare alef
    ('إ', 'ا'),       # alef with hamza below -> bare alef
    ('ة', 'ه'),       # teh marbuta -> heh
    ('ى', 'ي'),       # alef maksura -> yeh
)


def process(text):
    """Normalize Arabic text.

    Steps, in order:

    1. Remove diacritics with ``araby.strip_tashkeel``.
    2. Apply every rule in ``_PROCESS_RULES``: replace tatweel runs with a
       space, collapse repeated letters, then fold alef variants to bare
       alef, teh marbuta to heh and alef maksura to yeh.
    3. Collapse all whitespace runs to a single space and strip both ends.

    The patterns are written without a leading backslash: a backslash in
    front of an Arabic letter inside a normal string literal is an invalid
    escape sequence (SyntaxWarning on Python 3.12+), and the regex engine
    matches the letter literally either way.

    :param text: the string to normalize.
    :return: the normalized string.
    """
    text = araby.strip_tashkeel(text)

    # NOTE(review): tatweel is replaced by a space, not removed, so an
    # elongated word is split in two. Kept as-is to preserve existing
    # output; confirm whether deletion was intended.
    for pattern, replacement in _PROCESS_RULES:
        text = re.sub(pattern, replacement, text)

    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return " ".join(text.split())