Skip to content

Commit

Permalink
Merge pull request #6 from juanantoniodelgado/bundle
Browse files Browse the repository at this point in the history
Added Thai and Tagalog support
  • Loading branch information
juanantoniodelgado authored May 30, 2021
2 parents 3e8824f + b65715c commit f00018b
Show file tree
Hide file tree
Showing 3 changed files with 285 additions and 1 deletion.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ PHP StopWords removal library with support for multiple languages.
$stopwords->clean('your text to clean');

## Supported languages
Arabic, Armenian, Basque, Bulgarian, Catalan, Cebuano, Chinese, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Gujarati, Hebrew, Hindi, Hungarian, Indonesian, Italian, Japanese, Latvian, Malay, Norwegian, Persian, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Turkish, Ukrainian, and Vietnamese.
Arabic, Armenian, Basque, Bulgarian, Catalan, Cebuano, Chinese, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Gujarati, Hebrew, Hindi, Hungarian, Indonesian, Italian, Japanese, Latvian, Malay, Norwegian, Persian, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Tagalog, Thai, Turkish, Ukrainian, and Vietnamese.

### Notes
Language files are set according to [ISO 639-2][standard].
Expand Down Expand Up @@ -57,6 +57,8 @@ Russian | https://raw.githubusercontent.com/Alir3z4/stop-words/master/russian
Slovak | https://github.com/Alir3z4/stop-words/blob/master/slovak.txt
Spanish | http://www.ranks.nl/stopwords/spanish http://snowball.tartarus.org/algorithms/spanish/stop.txt https://github.com/Alir3z4/stop-words/blob/master/spanish.txt
Swedish | https://raw.githubusercontent.com/Alir3z4/stop-words/master/swedish.txt
Tagalog | https://github.com/stopwords-iso/stopwords-tl
Thai | https://github.com/stopwords-iso/stopwords-th
Turkish | https://raw.githubusercontent.com/Alir3z4/stop-words/master/turkish.txt
Ukrainian | https://raw.githubusercontent.com/Alir3z4/stop-words/master/ukrainian.txt
Vietnamese | https://github.com/Alir3z4/stop-words/blob/master/vietnamese.txt
Expand Down
157 changes: 157 additions & 0 deletions src/words/tagalog.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
{
"name": "Tagalog",
"handlers": [
"tagalog",
"tgl",
"tl"
],
"words": [
"akin",
"aking",
"ako",
"alin",
"am",
"amin",
"aming",
"ang",
"ano",
"anumang",
"apat",
"at",
"atin",
"ating",
"ay",
"bababa",
"bago",
"bakit",
"bawat",
"bilang",
"dahil",
"dalawa",
"dapat",
"din",
"dito",
"doon",
"gagawin",
"gayunman",
"ginagawa",
"ginawa",
"ginawang",
"gumawa",
"gusto",
"habang",
"hanggang",
"hindi",
"huwag",
"iba",
"ibaba",
"ibabaw",
"ibig",
"ikaw",
"ilagay",
"ilalim",
"ilan",
"inyong",
"isa",
"isang",
"itaas",
"ito",
"iyo",
"iyon",
"iyong",
"ka",
"kahit",
"kailangan",
"kailanman",
"kami",
"kanila",
"kanilang",
"kanino",
"kanya",
"kanyang",
"kapag",
"kapwa",
"karamihan",
"katiyakan",
"katulad",
"kaya",
"kaysa",
"ko",
"kong",
"kulang",
"kumuha",
"kung",
"laban",
"lahat",
"lamang",
"likod",
"lima",
"maaari",
"maaaring",
"maging",
"mahusay",
"makita",
"marami",
"marapat",
"masyado",
"may",
"mayroon",
"mga",
"minsan",
"mismo",
"mula",
"muli",
"na",
"nabanggit",
"naging",
"nagkaroon",
"nais",
"nakita",
"namin",
"napaka",
"narito",
"nasaan",
"ng",
"ngayon",
"ni",
"nila",
"nilang",
"nito",
"niya",
"niyang",
"noon",
"o",
"pa",
"paano",
"pababa",
"paggawa",
"pagitan",
"pagkakaroon",
"pagkatapos",
"palabas",
"pamamagitan",
"panahon",
"pangalawa",
"para",
"paraan",
"pareho",
"pataas",
"pero",
"pumunta",
"pumupunta",
"sa",
"saan",
"sabi",
"sabihin",
"sarili",
"sila",
"sino",
"siya",
"tatlo",
"tayo",
"tulad",
"tungkol",
"una",
"walang"
]
}
125 changes: 125 additions & 0 deletions src/words/thai.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
{
"name": "Thai",
"handlers": [
"thai",
"tha",
"th"
],
"words": [
"กล่าว",
"กว่า",
"กัน",
"กับ",
"การ",
"ก็",
"ก่อน",
"ขณะ",
"ขอ",
"ของ",
"ขึ้น",
"คง",
"ครั้ง",
"ความ",
"คือ",
"จะ",
"จัด",
"จาก",
"จึง",
"ช่วง",
"ซึ่ง",
"ดัง",
"ด้วย",
"ด้าน",
"ตั้ง",
"ตั้งแต่",
"ตาม",
"ต่อ",
"ต่าง",
"ต่างๆ",
"ต้อง",
"ถึง",
"ถูก",
"ถ้า",
"ทั้ง",
"ทั้งนี้",
"ทาง",
"ทำ",
"ทำให้",
"ที่",
"ที่สุด",
"ทุก",
"นอกจาก",
"นัก",
"นั้น",
"นำ",
"นี้",
"น่า",
"บาง",
"ผล",
"ผ่าน",
"พบ",
"พร้อม",
"มา",
"มาก",
"มี",
"ยัง",
"รวม",
"ระหว่าง",
"รับ",
"ราย",
"ร่วม",
"ลง",
"วัน",
"ว่า",
"สำหรับ",
"สุด",
"ส่ง",
"ส่วน",
"หนึ่ง",
"หรือ",
"หลัง",
"หลังจาก",
"หลาย",
"หาก",
"อยาก",
"อยู่",
"อย่าง",
"ออก",
"อะไร",
"อาจ",
"อีก",
"เขา",
"เข้า",
"เคย",
"เฉพาะ",
"เช่น",
"เดียว",
"เดียวกัน",
"เนื่องจาก",
"เปิด",
"เปิดเผย",
"เป็น",
"เป็นการ",
"เพราะ",
"เพื่อ",
"เมื่อ",
"เรา",
"เริ่ม",
"เลย",
"เห็น",
"เอง",
"แต่",
"แบบ",
"แรก",
"และ",
"แล้ว",
"แห่ง",
"โดย",
"ใน",
"ให้",
"ได้",
"ไป",
"ไม่",
"ไว้"
]
}

0 comments on commit f00018b

Please sign in to comment.