Skip to content

Commit 4ab3502

Browse files
committed
Fix a bug with Harakat breaking the reshaping
- Write two small unit tests, more to come
- Move letters and ligatures to separate files for readability
- Move package to its own folder for readability
1 parent 761e686 commit 4ab3502

12 files changed

+575
-481
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ venv/
4545

4646
# ignore
4747
.ignore/
48+
.DS_Store

.travis.yml

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# https://travis-ci.org/mpcabd/python-arabic-reshaper
2+
language: python
3+
python:
4+
- "2.7"
5+
- "3.6"
6+
install:
7+
- "pip install -e ."
8+
script:
9+
- "python setup.py test"

MANIFEST.in

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
include default-config.ini
2-
include arabic_reshaper.py
1+
include arabic_reshaper/default-config.ini
2+
include arabic_reshaper/arabic_reshaper.py
33
include README
File renamed without changes.

arabic_reshaper/arabic_reshaper.py

+268
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# This work is licensed under the GNU Public License (GPL).
4+
# To view a copy of this license, visit http://www.gnu.org/copyleft/gpl.html
5+
6+
# Written by Abdullah Diab (mpcabd)
7+
# Email: mpcabd@gmail.com
8+
# Website: http://mpcabd.xyz
9+
10+
# Ported and tweaked from Java to Python, from Better Arabic Reshaper
11+
# [https://github.com/agawish/Better-Arabic-Reshaper/]
12+
13+
# Usage:
14+
# Install python-bidi [https://github.com/MeirKriheli/python-bidi], can be
15+
# installed from pip `pip install python-bidi`.
16+
17+
# import arabic_reshaper
18+
# from bidi.algorithm import get_display
19+
# reshaped_text = arabic_reshaper.reshape('اللغة العربية رائعة')
20+
# bidi_text = get_display(reshaped_text)
21+
# Now you can pass `bidi_text` to any function that handles
22+
# displaying/printing of the text, like writing it to PIL Image or passing it
23+
# to a PDF generating method.
24+
25+
from __future__ import unicode_literals
26+
from builtins import range
27+
28+
import re
29+
import os
30+
31+
from configparser import ConfigParser
32+
from itertools import repeat
33+
from pkg_resources import resource_filename
34+
35+
from .ligatures import *
36+
from .letters import *
37+
38+
# Matches exactly one Arabic haraka (diacritic / combining annotation mark).
#
# The character class is a list of code point ranges. The gaps between the
# ranges are deliberate: U+06DD, U+06DE, U+06E9 and U+08E2 fall between
# ranges and are therefore NOT treated as harakat.
#
# A former extra range `\u08d4-\u08ed` overlapped its two neighbours and
# silently re-included U+08E2, defeating the gap left by
# `\u08d4-\u08e1` / `\u08e3-\u08ff`; it has been removed.
HARAKAT_RE = re.compile(
    '['
    '\u0610-\u061a'
    '\u064b-\u065f'
    '\u0670'
    '\u06d6-\u06dc'
    '\u06df-\u06e8'
    '\u06ea-\u06ed'
    '\u08d4-\u08e1'
    '\u08e3-\u08ff'
    ']',

    re.UNICODE | re.X
)
53+
54+
55+
class ArabicReshaper(object):
    """
    A class for Arabic reshaper, it allows for fine-tune configuration over
    the API.

    The configuration is assembled from up to three sources, read in this
    order:

    1. The default configuration file :file:`default-config.ini` shipped
       with the package (always read, must exist).
    2. An optional ini file: either the `configuration_file` argument or,
       when that argument is empty, the file named by the environment
       variable :envvar:`PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE`.
    3. An optional `configuration` mapping, loaded as the
       ``ArabicReshaper`` section after the files.

    Check these links for information on the configuration files format:

    * Python 3: https://docs.python.org/3/library/configparser.html
    * Python 2: https://docs.python.org/2/library/configparser.html

    See the default configuration file :file:`default-config.ini` for
    details on how to configure your reshaper.
    """

    def __init__(self, configuration=None, configuration_file=None):
        """
        Load the reshaper configuration.

        :param configuration: optional mapping of option names to values;
            it is read into the ``ArabicReshaper`` section after the
            configuration files.
        :param configuration_file: optional path of an ini file read after
            the default one. When empty, the path is taken from the
            environment variable
            :envvar:`PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE`, if set.
        :raises Exception: if the default configuration file, or the
            requested configuration file, does not exist.
        :raises ValueError: if no ``ArabicReshaper`` section was loaded.
        """
        super(ArabicReshaper, self).__init__()

        configuration_files = [
            resource_filename(__name__, 'default-config.ini')
        ]

        if not os.path.exists(configuration_files[0]):
            raise Exception(
                ('Default configuration file {} not found,' +
                 ' check the module installation.').format(
                     configuration_files[0],
                 )
            )

        # Remember where the path came from, only to improve the error
        # message below.
        loaded_from_envvar = False

        if not configuration_file:
            configuration_file = os.getenv(
                'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE'
            )
            loaded_from_envvar = bool(configuration_file)

        if configuration_file:
            if not os.path.exists(configuration_file):
                envvar_hint = (
                    ' it is set in your environment variable ' +
                    'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE'
                ) if loaded_from_envvar else ''
                raise Exception(
                    'Configuration file {} not found{}.'.format(
                        configuration_file,
                        envvar_hint
                    )
                )
            configuration_files.append(configuration_file)

        configuration_parser = ConfigParser()
        configuration_parser.read(configuration_files)

        if configuration:
            configuration_parser.read_dict({
                'ArabicReshaper': configuration
            })

        if 'ArabicReshaper' not in configuration_parser:
            raise ValueError(
                'Invalid configuration: '
                'A section with the name ArabicReshaper was not found'
            )

        self.configuration = configuration_parser['ArabicReshaper']

    @property
    def _ligatures_re(self):
        """
        Compiled regex with one capturing group per enabled ligature.

        The regex is built on first access from the `LIGATURES` entries
        whose name is enabled in the configuration, then cached on the
        instance. Building it also fills
        `_re_group_index_to_ligature_forms`, which maps the 0-based index
        of each capturing group to that ligature's forms.
        """
        # The attribute name is mangled inside the class body, so a
        # `hasattr(self, '__ligatures_re')` check can never succeed; reading
        # the attribute itself is the reliable cache test.
        try:
            return self.__ligatures_re
        except AttributeError:
            pass

        # Layout of the second item of a LIGATURES record.
        MATCH = 0
        FORMS = 1

        patterns = []
        re_group_index_to_ligature_forms = {}
        for ligature, replacement in LIGATURES:
            if not self.configuration.getboolean(ligature):
                continue
            # Group indexes follow the order in which patterns are added.
            group_index = len(patterns)
            re_group_index_to_ligature_forms[group_index] = (
                replacement[FORMS]
            )
            patterns.append('({})'.format(replacement[MATCH]))

        self._re_group_index_to_ligature_forms = (
            re_group_index_to_ligature_forms
        )
        self.__ligatures_re = re.compile('|'.join(patterns), re.UNICODE)
        return self.__ligatures_re

    def _get_ligature_forms_from_re_group_index(self, group_index):
        """
        Return the forms of the ligature matched by regex group
        `group_index` (0-based, as produced by `_ligatures_re`).

        :raises KeyError: if `group_index` is not a known group index.
        """
        if not hasattr(self, '_re_group_index_to_ligature_forms'):
            # Accessing the property builds the mapping as a side effect.
            self._ligatures_re
        return self._re_group_index_to_ligature_forms[group_index]

    def reshape(self, text):
        """
        Return `text` with its letters replaced by their contextual forms.

        Characters matched by `HARAKAT_RE` do not take part in shaping; they
        are dropped when the ``delete_harakat`` option is on, otherwise
        they are re-emitted after the letter they followed. Characters that
        are not in `LETTERS` are copied unchanged. When the
        ``support_ligatures`` option is on, enabled ligatures found in the
        harakat-free text are replaced by their ligature form.

        :param text: the text to reshape.
        :returns: the reshaped text, or ``''`` if `text` is empty.
        """
        if not text:
            return ''

        # One (letter, form) pair per non-haraka character of `text`.
        output = []

        LETTER = 0
        FORM = 1
        NOT_SUPPORTED = -1

        delete_harakat = self.configuration.getboolean('delete_harakat')
        # Maps an index in `output` to the harakat that follow that letter;
        # index -1 holds harakat found before the first letter.
        positions_harakat = {}

        for letter in text:
            if HARAKAT_RE.match(letter):
                if not delete_harakat:
                    positions_harakat.setdefault(
                        len(output) - 1, []
                    ).append(letter)
            elif letter not in LETTERS:
                output.append((letter, NOT_SUPPORTED))
            elif not output:
                output.append((letter, ISOLATED))
            else:
                previous_letter = output[-1]
                if previous_letter[FORM] == NOT_SUPPORTED:
                    output.append((letter, ISOLATED))
                elif not connects_with_letter_before(letter):
                    output.append((letter, ISOLATED))
                elif not connects_with_letter_after(
                    previous_letter[LETTER]
                ):
                    output.append((letter, ISOLATED))
                elif (previous_letter[FORM] == FINAL and not
                      connects_with_letters_before_and_after(
                          previous_letter[LETTER]
                      )):
                    output.append((letter, ISOLATED))
                elif previous_letter[FORM] == ISOLATED:
                    output[-1] = (
                        previous_letter[LETTER],
                        INITIAL
                    )
                    output.append((letter, FINAL))
                # Otherwise, we will change the previous letter to connect
                # to the current letter
                else:
                    output[-1] = (
                        previous_letter[LETTER],
                        MEDIAL
                    )
                    output.append((letter, FINAL))

        ligature_matches = ()
        if self.configuration.getboolean('support_ligatures'):
            ligatures_re = self._ligatures_re
            # With every individual ligature disabled the pattern is empty
            # and would match at every position without any group; skip the
            # pass entirely instead of failing on a missing group.
            if self._re_group_index_to_ligature_forms:
                # Clean text from Harakat to be able to find ligatures.
                # The cleaned text lines up index-for-index with `output`.
                ligature_matches = ligatures_re.finditer(
                    HARAKAT_RE.sub('', text)
                )

        for match in ligature_matches:
            group_index = next((
                i for i, group in enumerate(match.groups()) if group
            ), -1)
            if group_index < 0:
                # No non-empty group took part in this match.
                continue
            forms = self._get_ligature_forms_from_re_group_index(
                group_index
            )
            a, b = match.span()
            a_form = output[a][FORM]
            b_form = output[b - 1][FORM]

            # The ligature form depends on the forms of the first (a) and
            # last (b) letters it replaces:
            #
            # +-----------+----------+---------+---------+----------+
            # | a   \   b | ISOLATED | INITIAL | MEDIAL  | FINAL    |
            # +-----------+----------+---------+---------+----------+
            # | ISOLATED  | ISOLATED | INITIAL | INITIAL | ISOLATED |
            # | INITIAL   | ISOLATED | INITIAL | INITIAL | ISOLATED |
            # | MEDIAL    | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
            # | FINAL     | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
            # +-----------+----------+---------+---------+----------+

            if a_form in (ISOLATED, INITIAL):
                if b_form in (ISOLATED, FINAL):
                    ligature_form = ISOLATED
                else:
                    ligature_form = INITIAL
            else:
                if b_form in (ISOLATED, FINAL):
                    ligature_form = FINAL
                else:
                    ligature_form = MEDIAL
            if not forms[ligature_form]:
                # This ligature has no glyph for the required form.
                continue
            # The ligature glyph takes the first slot; the remaining slots
            # are blanked so indexes (and harakat positions) stay valid.
            output[a] = (forms[ligature_form], NOT_SUPPORTED)
            output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a)

        result = []
        if not delete_harakat and -1 in positions_harakat:
            result.extend(positions_harakat[-1])
        for i, o in enumerate(output):
            if o[LETTER]:
                if o[FORM] == NOT_SUPPORTED:
                    result.append(o[LETTER])
                else:
                    result.append(LETTERS[o[LETTER]][o[FORM]])

            if not delete_harakat:
                if i in positions_harakat:
                    result.extend(positions_harakat[i])

        return ''.join(result)
265+
266+
267+
# Shared module-level reshaper, created at import time with no explicit
# arguments, so it uses the packaged default configuration plus the file
# named by PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE when that is set.
default_reshaper = ArabicReshaper()
268+
# Module-level convenience function: the bound `reshape` method of the
# shared `default_reshaper` instance.
reshape = default_reshaper.reshape
File renamed without changes.

0 commit comments

Comments
 (0)