Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fix for encoding problems under Linux with limited UTF-8 support, and add ctypes support for NLPIR #3

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ NLPIR_wrap.cxx
_NLPIR.so
build/
*.pyc
*.log
8 changes: 4 additions & 4 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

if __name__ == '__main__':

nlpir_init('.', 'UTF-8')
print nlpir_paragraph_process(r'@ICTCLAS张华平博士 应各位ICTCLAS用户的要求,张华平博士提前发布ICTCLAS2013 版本,为了与以前工作进行大的区隔,并推广NLPIR自然语言处理与信息检索共享平台,从本版本开始,系统名称调整为NLPIR汉语分词系统。')
print
print nlpir_paragraph_process(r'“屌丝”这个嘲讽意味的代词迅速爆红,迎合了大众的心理和趣味。因为你会发现从表面符合屌丝定义的人,到和屌丝属性八竿子打不着的人,都在争相认领这一名号。当人人都在忙着确认自己的屌丝身份,并乐此不疲时,屌丝一词一定与时代的什么特征实现了合拍。“屌丝”不是阿Q,他们公然比惨并乐在其中有评论认为,“屌丝”是新时代的阿Q,两者并不完全相同。首先,阿Q是文学巨匠鲁迅一己之力创造的,而“屌丝”则是网络群体狂欢的结果,它是真正由网民集体创作的形象;另外,阿Q最重要的特征是“精神胜利法”,梦想的是“银盔银甲”,意淫的是“我手持钢鞭将你打”。', True)
nlpir_init('.', 'UTF-8')
print nlpir_paragraph_process(u'@ICTCLAS张华平博士 应各位ICTCLAS用户的要求,张华平博士提前发布ICTCLAS2013 版本,为了与以前工作进行大的区隔,并推广NLPIR自然语言处理与信息检索共享平台,从本版本开始,系统名称调整为NLPIR汉语分词系统。'.encode('gbk'))
print
print nlpir_paragraph_process(u'“屌丝”这个嘲讽意味的代词迅速爆红,迎合了大众的心理和趣味。因为你会发现从表面符合屌丝定义的人,到和屌丝属性八竿子打不着的人,都在争相认领这一名号。当人人都在忙着确认自己的屌丝身份,并乐此不疲时,屌丝一词一定与时代的什么特征实现了合拍。“屌丝”不是阿Q,他们公然比惨并乐在其中有评论认为,“屌丝”是新时代的阿Q,两者并不完全相同。首先,阿Q是文学巨匠鲁迅一己之力创造的,而“屌丝”则是网络群体狂欢的结果,它是真正由网民集体创作的形象;另外,阿Q最重要的特征是“精神胜利法”,梦想的是“银盔银甲”,意淫的是“我手持钢鞭将你打”。'.encode('gbk'), True)
nlpir_exit()
49 changes: 49 additions & 0 deletions demo_linux_utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*-coding: utf-8 -*-

from PyNLPIR import *
import sys
import locale

# Python 2 idiom: reload(sys) is called so that sys.setdefaultencoding can be
# used, then the process-wide default string encoding is switched to UTF-8.
reload(sys)
sys.setdefaultencoding('UTF-8')

def p(f):
    """Call the zero-argument callable *f* and print its result.

    The output line has the form ``<module>.<name>(): <result>``, built from
    ``f.__module__``, ``f.__name__`` and the value returned by ``f()``.
    """
    # A parenthesised single-argument print produces the same output on
    # Python 2 (print statement) and Python 3 (print function).
    print('%s.%s(): %s' % (f.__module__, f.__name__, f()))

# Shows whether this interpreter build uses UCS-2 or UCS-4
print sys.maxunicode

# Check the encoding of the standard output stream
print sys.stdout.encoding

# Report the default character encoding currently used by the system
p(sys.getdefaultencoding)

# Report the encoding used to convert Unicode file names to system file names
p(sys.getfilesystemencoding)

# Get the default locale settings as a (language, encoding) tuple
p(locale.getdefaultlocale)

# Report the user's preferred encoding for text data
# The documentation notes that this function only returns a guess
p(locale.getpreferredencoding)

if __name__ == '__main__':

nlpir_init('.', 'UTF-8')

firstTest = nlpir_paragraph_process(u'你好中国,我亲爱的祖国!GBK, GB2312, GB18030是中文的三种字符集,UCS是万国字符集!'.encode('gbk'))
print type(firstTest)
print repr(firstTest)
print firstTest

print

secondTest = nlpir_paragraph_process(u'编码真是但疼啊!'.encode('gbk'), True)
print type(secondTest)
print repr(secondTest)
print secondTest

nlpir_exit()
23 changes: 23 additions & 0 deletions nlpir-ctypes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
nlpir-ctypes
============

### Idea
This approach is inspired by the article [Python下调用NLPIR(ICTCLAS2013)的ctype做法](http://ictclas.nlpir.org/newsDetail?DocId=382), which demonstrates the technique on the Win32 platform.

### How to hack
I made it run on the Linux32 platform as well. As new versions are released, the function symbols may need to be updated to match the newer `libNLPIR.so`/`NLPIR.dll`.

To adapt it, follow these steps:

* replace the library path in `nlpir = CDLL(the_library_path)` in `nlpir-ctypes.py`
* use a suitable tool to dump the exported NLPIR function symbols (on Linux, `nm` or `objdump`; on Windows, try `dumpbin` or [DLL Exp](http://www.nirsoft.net/utils/dll_export_viewer.html))
* update the function symbols in the `getattr` calls
* check the function parameters and adjust the `fillprototype` calls if necessary
* you are done :-)

### What's next
As you can see, it should not be difficult to add Linux64 and Win64 support :-)

**Note**

Please use the matching Python interpreter: a Linux interpreter for `*.so` files and a Windows interpreter for `*.dll` files. Otherwise `CDLL` will fail.
92 changes: 92 additions & 0 deletions nlpir-ctypes/linux32/nlpir-ctypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from ctypes import *
import codecs
import sys

# Python 2 idiom: reload(sys) is called so that sys.setdefaultencoding can be
# used, then the process-wide default string encoding is switched to UTF-8.
reload(sys)
sys.setdefaultencoding('UTF-8')
print sys.getdefaultencoding()

# Load the NLPIR shared library; the relative path means the script has to be
# started from its own directory.
nlpir = CDLL('../../linux32/libNLPIR.so')

# Debug output: show the ctypes handle that was created.
print type(nlpir)
print nlpir



def fillprototype(f, restype, argtypes):
    """Attach a C prototype to the foreign function object *f*.

    Sets ``f.restype`` to *restype* and ``f.argtypes`` to *argtypes*.
    Callers in this script pass ``None`` as *argtypes* for functions that
    take no parameters.
    """
    f.restype = restype
    f.argtypes = argtypes

# Bind each exported function by its C++-mangled symbol name. getattr() is
# used because these names have to be looked up as strings; they must match
# the exact libNLPIR.so build that was loaded above.
# NOTE(review): the symbols used for FileProcess and GetFileNewWords carry the
# `_ZN6CNLPIR...` prefix, i.e. they are members of a CNLPIR class rather than
# free NLPIR_* functions -- confirm they can be called without an object.
MY_NLPIR_Init = getattr(nlpir, '_Z10NLPIR_InitPKciS0_')
MY_NLPIR_Exit = getattr(nlpir, '_Z10NLPIR_Exitv')
MY_NLPIR_ParagraphProcess = getattr(nlpir, '_Z22NLPIR_ParagraphProcessPKci')
MY_NLPIR_ImportUserDict = getattr(nlpir, '_Z20NLPIR_ImportUserDictPKc')
MY_NLPIR_FileProcess = getattr(nlpir, '_ZN6CNLPIR11FileProcessEPKcS1_i')
MY_NLPIR_AddUserWord = getattr(nlpir, '_Z17NLPIR_AddUserWordPKc')
MY_NLPIR_SaveTheUsrDic = getattr(nlpir, '_Z19NLPIR_SaveTheUsrDicv')
MY_NLPIR_DelUsrWord = getattr(nlpir, '_Z16NLPIR_DelUsrWordPKc')
MY_NLPIR_GetKeyWords = getattr(nlpir, '_Z17NLPIR_GetKeyWordsPKcib')
MY_NLPIR_GetFileKeyWords = getattr(nlpir, '_Z21NLPIR_GetFileKeyWordsPKcib')
MY_NLPIR_GetNewWords = getattr(nlpir, '_Z17NLPIR_GetNewWordsPKcib')
MY_NLPIR_GetFileNewWords = getattr(nlpir, '_ZN6CNLPIR15GetFileNewWordsEPKcib')
MY_NLPIR_SetPOSmap = getattr(nlpir, '_Z15NLPIR_SetPOSmapi')
MY_NLPIR_FingerPrint = getattr(nlpir, '_Z17NLPIR_FingerPrintPKc')
# New Word Identification
MY_NLPIR_NWI_Start = getattr(nlpir, '_Z15NLPIR_NWI_Startv')
MY_NLPIR_NWI_AddFile = getattr(nlpir, '_Z17NLPIR_NWI_AddFilePKc')
MY_NLPIR_NWI_AddMem = getattr(nlpir, '_Z16NLPIR_NWI_AddMemPKc')
MY_NLPIR_NWI_Complete = getattr(nlpir, '_Z18NLPIR_NWI_Completev')
MY_NLPIR_NWI_GetResult = getattr(nlpir, '_Z19NLPIR_NWI_GetResultb')
MY_NLPIR_NWI_Result2UserDict = getattr(nlpir, '_Z25NLPIR_NWI_Result2UserDictv')

# Declare the return type and parameter types of every bound function so that
# ctypes converts arguments and results correctly.
# NOTE(review): the symbol bound for NLPIR_Init ('_Z10NLPIR_InitPKciS0_')
# encodes three parameters (const char*, int, const char*), but only two are
# declared here and the script passes two -- confirm the third parameter and
# update the prototype together with the call.
fillprototype(MY_NLPIR_Init, c_bool, [c_char_p, c_int])
fillprototype(MY_NLPIR_Exit, c_bool, None)
fillprototype(MY_NLPIR_ParagraphProcess, c_char_p, [c_char_p, c_int])
fillprototype(MY_NLPIR_ImportUserDict, c_uint, [c_char_p])
fillprototype(MY_NLPIR_FileProcess, c_double, [c_char_p, c_char_p, c_int])
fillprototype(MY_NLPIR_AddUserWord, c_int, [c_char_p])
fillprototype(MY_NLPIR_SaveTheUsrDic, c_int, None)
fillprototype(MY_NLPIR_DelUsrWord, c_int, [c_char_p])
fillprototype(MY_NLPIR_GetKeyWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_GetFileKeyWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_GetNewWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_GetFileNewWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_SetPOSmap, c_int, [c_int])
fillprototype(MY_NLPIR_FingerPrint, c_ulong, [c_char_p])
# New Word Identification
fillprototype(MY_NLPIR_NWI_Start, c_bool, None)
fillprototype(MY_NLPIR_NWI_AddFile, c_bool, [c_char_p])
fillprototype(MY_NLPIR_NWI_AddMem, c_bool, [c_char_p])
fillprototype(MY_NLPIR_NWI_Complete, c_bool, None)
# The bound symbol ends in 'b', i.e. the single parameter is a C++ bool, so it
# is declared as c_bool like the bool parameters of the *KeyWords functions.
fillprototype(MY_NLPIR_NWI_GetResult, c_char_p, [c_bool])
fillprototype(MY_NLPIR_NWI_Result2UserDict, c_uint, None)

look_gb = codecs.lookup('gb2312')
look_utf = codecs.lookup('utf-8')

if not MY_NLPIR_Init('../../', 1):
print 'NLPIR Initial failed!'
exit()

sentence = u"我爱我的祖国,亲爱的祖国!"
print type(sentence)
print sentence

result = MY_NLPIR_ParagraphProcess(sentence.encode('gb2312'), c_int(1))
print result

result_unicode = look_utf.decode(result)[0]
print result_unicode

result_gb2312 = look_gb.encode(result_unicode)[0]
print result_gb2312

result_gbk = look_gb.decode(result_gb2312)[0]
print result_gbk

MY_NLPIR_Exit()

print 'Goodbye!'
92 changes: 92 additions & 0 deletions nlpir-ctypes/win32/nlpir-ctypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from ctypes import *
import codecs
import sys

# Python 2 idiom: reload(sys) is called so that sys.setdefaultencoding can be
# used, then the process-wide default string encoding is switched to UTF-8.
reload(sys)
sys.setdefaultencoding('UTF-8')
print sys.getdefaultencoding()

# Load the NLPIR DLL; the relative path means the script has to be started
# from its own directory.
nlpir = CDLL('../../win32/NLPIR.dll')

# Debug output: show the ctypes handle that was created.
print type(nlpir)
print nlpir



def fillprototype(f, restype, argtypes):
    """Attach a C prototype to the foreign function object *f*.

    Sets ``f.restype`` to *restype* and ``f.argtypes`` to *argtypes*.
    Callers in this script pass ``None`` as *argtypes* for functions that
    take no parameters.
    """
    f.restype = restype
    f.argtypes = argtypes

# Bind each exported function by its decorated (C++-mangled) symbol name.
# getattr() is used because names such as '?NLPIR_Init@@YA_NPBDH@Z' are not
# valid Python identifiers; they must match the exact NLPIR.dll build loaded.
# The library handle in this script is `nlpir`; the lookups previously used an
# undefined name `dll`, which raised NameError on the first line.
MY_NLPIR_Init = getattr(nlpir, '?NLPIR_Init@@YA_NPBDH@Z')
MY_NLPIR_Exit = getattr(nlpir, '?NLPIR_Exit@@YA_NXZ')
MY_NLPIR_ParagraphProcess = getattr(nlpir, '?NLPIR_ParagraphProcess@@YAPBDPBDH@Z')
MY_NLPIR_ImportUserDict = getattr(nlpir, '?NLPIR_ImportUserDict@@YAIPBD@Z')
MY_NLPIR_FileProcess = getattr(nlpir, '?NLPIR_FileProcess@@YANPBD0H@Z')
MY_NLPIR_AddUserWord = getattr(nlpir, '?NLPIR_AddUserWord@@YAHPBD@Z')
MY_NLPIR_SaveTheUsrDic = getattr(nlpir, '?NLPIR_SaveTheUsrDic@@YAHXZ')
MY_NLPIR_DelUsrWord = getattr(nlpir, '?NLPIR_DelUsrWord@@YAHPBD@Z')
MY_NLPIR_GetKeyWords = getattr(nlpir, '?NLPIR_GetKeyWords@@YAPBDPBDH_N@Z')
MY_NLPIR_GetFileKeyWords = getattr(nlpir, '?NLPIR_GetFileKeyWords@@YAPBDPBDH_N@Z')
MY_NLPIR_GetNewWords = getattr(nlpir, '?NLPIR_GetNewWords@@YAPBDPBDH_N@Z')
MY_NLPIR_GetFileNewWords = getattr(nlpir, '?NLPIR_GetFileNewWords@@YAPBDPBDH_N@Z')
MY_NLPIR_SetPOSmap = getattr(nlpir, '?NLPIR_SetPOSmap@@YAHH@Z')
MY_NLPIR_FingerPrint = getattr(nlpir, '?NLPIR_FingerPrint@@YAKPBD@Z')
# New Word Identification
MY_NLPIR_NWI_Start = getattr(nlpir, '?NLPIR_NWI_Start@@YA_NXZ')
MY_NLPIR_NWI_AddFile = getattr(nlpir, '?NLPIR_NWI_AddFile@@YAHPBD@Z')
MY_NLPIR_NWI_AddMem = getattr(nlpir, '?NLPIR_NWI_AddMem@@YA_NPBD@Z')
MY_NLPIR_NWI_Complete = getattr(nlpir, '?NLPIR_NWI_Complete@@YA_NXZ')
MY_NLPIR_NWI_GetResult = getattr(nlpir, '?NLPIR_NWI_GetResult@@YAPBD_N@Z')
MY_NLPIR_NWI_Result2UserDict = getattr(nlpir, '?NLPIR_NWI_Result2UserDict@@YAIXZ')

# Declare the return type and parameter types of every bound function so that
# ctypes converts arguments and results correctly. The types follow the
# decorated symbol names the functions were bound with.
fillprototype(MY_NLPIR_Init, c_bool, [c_char_p, c_int])
fillprototype(MY_NLPIR_Exit, c_bool, None)
fillprototype(MY_NLPIR_ParagraphProcess, c_char_p, [c_char_p, c_int])
fillprototype(MY_NLPIR_ImportUserDict, c_uint, [c_char_p])
fillprototype(MY_NLPIR_FileProcess, c_double, [c_char_p, c_char_p, c_int])
fillprototype(MY_NLPIR_AddUserWord, c_int, [c_char_p])
fillprototype(MY_NLPIR_SaveTheUsrDic, c_int, None)
fillprototype(MY_NLPIR_DelUsrWord, c_int, [c_char_p])
fillprototype(MY_NLPIR_GetKeyWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_GetFileKeyWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_GetNewWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_GetFileNewWords, c_char_p, [c_char_p, c_int, c_bool])
fillprototype(MY_NLPIR_SetPOSmap, c_int, [c_int])
fillprototype(MY_NLPIR_FingerPrint, c_ulong, [c_char_p])
# New Word Identification
fillprototype(MY_NLPIR_NWI_Start, c_bool, None)
# '?NLPIR_NWI_AddFile@@YAHPBD@Z' encodes an int return ('H'), unlike
# NWI_AddMem ('_N' = bool), so the result is declared as c_int.
fillprototype(MY_NLPIR_NWI_AddFile, c_int, [c_char_p])
fillprototype(MY_NLPIR_NWI_AddMem, c_bool, [c_char_p])
fillprototype(MY_NLPIR_NWI_Complete, c_bool, None)
# '?NLPIR_NWI_GetResult@@YAPBD_N@Z' encodes a single bool parameter ('_N').
fillprototype(MY_NLPIR_NWI_GetResult, c_char_p, [c_bool])
fillprototype(MY_NLPIR_NWI_Result2UserDict, c_uint, None)

look_gb = codecs.lookup('gb2312')
look_utf = codecs.lookup('utf-8')

if not MY_NLPIR_Init('../../', 1):
print 'NLPIR Initial failed!'
exit()

sentence = u"我爱我的祖国,亲爱的祖国!"
print type(sentence)
print sentence

result = MY_NLPIR_ParagraphProcess(sentence.encode('gb2312'), c_int(1))
print result

result_unicode = look_utf.decode(result)[0]
print result_unicode

result_gb2312 = look_gb.encode(result_unicode)[0]
print result_gb2312

result_gbk = look_gb.decode(result_gb2312)[0]
print result_gbk

MY_NLPIR_Exit()

print 'Goodbye!'