Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pythainlp.wsd for Thai Word Sense Disambiguation #818

Merged
merged 8 commits into from
Jul 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ ufal.chu-liu-edmonds==1.0.2
wtpsplit==1.0.1
fastcoref==2.1.6
panphon==0.20.0
sentence-transformers==2.2.2
2 changes: 2 additions & 0 deletions docs/api/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ Modules
.. autofunction:: download
.. autofunction:: remove
.. autofunction:: provinces
.. autofunction:: thai_dict
.. autofunction:: thai_stopwords
.. autofunction:: thai_words
.. autofunction:: thai_wsd_dict
.. autofunction:: thai_orst_words
.. autofunction:: thai_syllables
.. autofunction:: thai_negations
Expand Down
12 changes: 12 additions & 0 deletions docs/api/wsd.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
.. currentmodule:: pythainlp.wsd

pythainlp.wsd
=============

The :mod:`pythainlp.wsd` module provides a function for getting the sense of a word, for Thai Word Sense Disambiguation (WSD).


Modules
-------

.. autofunction:: get_sense
1 change: 1 addition & 0 deletions docs/notes/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ where ``extras`` can be
- ``transformers_ud`` (to support transformers_ud engine)
- ``dependency_parsing`` (to support dependency parsing with all engines)
- ``coreference_resolution`` (to support coreference resolution with all engines)
- ``wsd`` (to support pythainlp.wsd)
- ``full`` (install everything)

For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
Expand Down
173 changes: 173 additions & 0 deletions notebooks/test_wsd.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "70e6b5ba-063d-4e53-a312-2380b49bc3a9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from pythainlp.wsd import get_sense"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n"
]
}
],
"source": [
"print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n"
]
}
],
"source": [
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b0ee35fc-f26e-4bce-b6fa-0e1efc863ae4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n"
]
}
],
"source": [
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "32fa3fe9-0e1a-4176-b8f3-18d666eb3162",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from pythainlp.corpus import get_corpus_path, thai_wsd_dict"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0f88ff4c-06db-4cba-8086-4bb2160bead0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"_w=thai_wsd_dict()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "83642893-d9a6-4271-a1b7-5e57638a74d4",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['word', 'meaning'])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_w.keys()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bb67c468-ce65-4581-adc6-832d70cfabab",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"('เดิน', ['ยกเท้าก้าวไป', 'เคลื่อนไปด้วยกำลังต่าง ๆ'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_w[\"word\"][0],_w[\"meaning\"][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27fbe522-019f-4157-a9a8-50ae62b50727",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 4 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@
"get_corpus_path",
"provinces",
"remove",
"thai_dict",
"thai_family_names",
"thai_female_names",
"thai_male_names",
"thai_negations",
"thai_stopwords",
"thai_syllables",
"thai_words",
"thai_wsd_dict",
"thai_orst_words",
"path_pythainlp_corpus",
"get_path_folder_corpus",
Expand Down Expand Up @@ -112,4 +114,6 @@ def corpus_db_path() -> str:
thai_syllables,
thai_words,
thai_orst_words,
thai_dict,
thai_wsd_dict
)
55 changes: 54 additions & 1 deletion pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@
"thai_stopwords",
"thai_syllables",
"thai_words",
"thai_dict",
"thai_wsd_dict",
]

from typing import FrozenSet, List, Union

from pythainlp.corpus import get_corpus
from pythainlp.corpus import get_corpus, get_corpus_path

_THAI_COUNTRIES = set()
_THAI_COUNTRIES_FILENAME = "countries_th.txt"
Expand Down Expand Up @@ -60,6 +62,9 @@

_THAI_ORST_WORDS = set()

# Module-level caches, empty until first use: thai_dict() fills _THAI_DICT
# and thai_wsd_dict() fills _THAI_WSD_DICT, then later calls reuse them.
_THAI_DICT = {}
_THAI_WSD_DICT = {}


def countries() -> FrozenSet[str]:
"""
Expand Down Expand Up @@ -256,3 +261,51 @@ def thai_male_names() -> FrozenSet[str]:
_THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME)

return _THAI_MALE_NAMES


def thai_dict() -> dict:
    """
    Return Thai dictionary with definition from wiktionary.
    \n(See: `thai_dict\
    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)

    The corpus CSV is read on the first call only; the parsed result is
    cached in the module-level ``_THAI_DICT`` and returned on later calls.

    :return: dictionary with two parallel lists under the keys ``"word"``
             and ``"meaning"``; each meaning is the raw text of the CSV
             ``meaning`` column for the word at the same index
    :rtype: dict
    """
    global _THAI_DICT
    if not _THAI_DICT:
        import csv

        words = []
        meanings = []
        # newline="" is what the csv module asks for, so that newlines
        # embedded inside quoted fields are interpreted correctly.
        with open(
            get_corpus_path("thai_dict"), newline="", encoding="utf-8"
        ) as csvfile:
            for row in csv.DictReader(csvfile, delimiter=","):
                words.append(row["word"])
                meanings.append(row["meaning"])

        # Publish the cache only after the whole file has been read, so a
        # failed open/parse does not leave an empty result cached forever.
        _THAI_DICT = {"word": words, "meaning": meanings}

    return _THAI_DICT


def thai_wsd_dict() -> dict:
    """
    Return Thai Word Sense Disambiguation dictionary with definition from wiktionary.
    \n(See: `thai_dict\
    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)

    Built from :func:`thai_dict` on the first call and cached in the
    module-level ``_THAI_WSD_DICT``. Only words that have more than one
    distinct definition are kept.

    :return: dictionary with two parallel lists under the keys ``"word"``
             and ``"meaning"``; each meaning is a list of the distinct
             definitions of the word at the same index
    :rtype: dict
    """
    global _THAI_WSD_DICT
    if not _THAI_WSD_DICT:
        import ast

        source = thai_dict()
        words = []
        meanings = []
        for word, raw_meaning in zip(source["word"], source["meaning"]):
            # NOTE(review): this used to be eval(raw_meaning). The text comes
            # from a downloaded corpus file, so it is parsed as a literal
            # only; literal_eval raises instead of executing arbitrary code.
            # Presumably each cell is a dict literal mapping a category
            # (e.g. part of speech) to a list of definitions - confirm
            # against the corpus.
            senses_by_key = ast.literal_eval(raw_meaning)

            # De-duplicate while keeping first-seen order, so the result is
            # the same on every run (set() ordering of str is not).
            senses = list(
                dict.fromkeys(
                    sense
                    for group in senses_by_key.values()
                    for sense in group
                )
            )

            # A word with a single sense needs no disambiguation.
            if len(senses) > 1:
                words.append(word)
                meanings.append(senses)

        # Publish the cache only once it is fully built.
        _THAI_WSD_DICT = {"word": words, "meaning": meanings}

    return _THAI_WSD_DICT
19 changes: 19 additions & 0 deletions pythainlp/wsd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Thai Word Sense Disambiguation (WSD)

Re-exports :func:`get_sense` from :mod:`pythainlp.wsd.core` as the
package's public API.
"""
__all__ = ["get_sense"]
from pythainlp.wsd.core import get_sense
Loading