Skip to content

Commit 113bd51

Browse files
committed
Init project and update README.md
1 parent 98b19d6 commit 113bd51

File tree

6 files changed

+608
-0
lines changed

6 files changed

+608
-0
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.idea
2+
composer.lock
3+
composer.phar
4+
/exemple/treeTagger
5+
/vendor/

README.md

+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# php-lemmatizer
2+
3+
[![Minimum PHP Version](https://img.shields.io/badge/php-%3E%3D%207.2-8892BF.svg)](https://php.net/)
4+
[![Latest Stable Version](https://img.shields.io/packagist/v/mbeurel/php-lemmatizer.svg)](https://packagist.org/packages/mbeurel/php-lemmatizer)
5+
[![Total Downloads](https://poser.pugx.org/mbeurel/php-lemmatizer/downloads.svg)](https://packagist.org/packages/mbeurel/php-lemmatizer)
6+
[![License](https://poser.pugx.org/mbeurel/php-lemmatizer/license.svg)](https://packagist.org/packages/mbeurel/php-lemmatizer)
7+
8+
A simple lemmatizer tool based on [TreeTagger](https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/) for PHP.
9+
10+
## Installing the TreeTagger library
11+
12+
See the [TreeTagger website](https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/) for download and installation instructions.
13+
14+
## Install php-lemmatizer
15+
16+
You can install it with Composer:
17+
18+
```
19+
composer require mbeurel/php-lemmatizer
20+
```
21+
22+
## Examples
23+
24+
Example scripts are available in the [exemple](https://github.com/mbeurel/php-lemmatizer/tree/master/exemple) directory of this repository.
25+
26+
## Sample Code
27+
```php
28+
include "vendor/autoload.php";
29+
use PhpTreeTagger\TreeTagger;
30+
$treeTaggerPath = __DIR__."/treeTagger"; // Library TreeTagger path
31+
32+
try {
33+
34+
// Init library
35+
$treeTagger = new TreeTagger("french", array(
36+
"treeTaggerPath" => $treeTaggerPath, // Path to TreeTagger Library
37+
"debug" => false, // View Debug
38+
"wordUnique" => true, // Keep only one occurrence of the word
39+
"wordRemoveAccent" => true, // Remove all accent in word
40+
"nbProcess" => 6 // Number of processes executed at the same time
41+
)
42+
);
43+
44+
// Remove type in words
45+
$treeTagger->setCleanTypeWords(
46+
array(
47+
"PRO:PER",
48+
"DET:ART",
49+
"DET:POS",
50+
"SENT",
51+
"PRP"
52+
)
53+
);
54+
55+
// lemmatizer() accepts a string or an array of strings, e.g. ["La lemmatisation désigne un traitement lexical", "apporté à un texte en vue de son analyse"]
56+
$result = $treeTagger->lemmatizer("La lemmatisation désigne un traitement lexical apporté à un texte en vue de son analyse.");
57+
58+
// View result :
59+
var_dump($result);
60+
61+
// $result = array(
62+
// 0 => array(
63+
// "value" => "lemmatisation designer traitement lexical apporter texte vue analyse",
64+
// "detail" => array(
65+
// 1 => array(
66+
// "source" => "lemmatisation",
67+
// "type" => "NOM",
68+
// "dest" => "lemmatisation"
69+
// ),
70+
// 2 => array(
71+
// "source" => "désigne",
72+
// "type" => "VER:pres",
73+
// "dest" => "désigner"
74+
// ),
75+
// 4 => array(
76+
// "source" => "traitement",
77+
// "type" => "NOM",
78+
// "dest" => "traitement"
79+
// ),
80+
// 6 => array(
81+
// "source" => "apporté",
82+
// "type" => "VER:pper",
83+
// "dest" => "apporter"
84+
// ),
85+
// 7 => array(
86+
// "source" => "à",
87+
// "type" => "PRP",
88+
// "dest" => "à"
89+
// ),
90+
// 9 => array(
91+
// "source" => "texte",
92+
// "type" => "NOM",
93+
// "dest" => "texte"
94+
// ),
95+
// 10 => array(
96+
// "source" => "en",
97+
// "type" => "PRP",
98+
// "dest" => "en"
99+
// ),
100+
// 11 => array(
101+
// "source" => "vue",
102+
// "type" => "NOM",
103+
// "dest" => "vue"
104+
// ),
105+
// 12 => array(
106+
// "source" => "de",
107+
// "type" => "PRP",
108+
// "dest" => "de"
109+
// ),
110+
// 13 => array(
111+
// "source" => "son",
112+
// "type" => "DET:POS",
113+
// "dest" => "son"
114+
// ),
115+
// 14 => array(
116+
// "source" => "analyse",
117+
// "type" => "NOM",
118+
// "dest" => "analyse"
119+
// ),
120+
// 15 => array(
121+
// "source" => ".",
122+
// "type" => "SENT",
123+
// "dest" => "."
124+
// )
125+
// )
126+
// )
127+
// );
128+
} catch(\Exception $e) {
129+
echo $e;
130+
}
131+
```
132+
133+
## Credits
134+
135+
Created by [Matthieu Beurel](https://www.mbeurel.com). Sponsored by [Nexboard](https://www.nexboard.fr).

composer.json

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"name": "mbeurel/php-lemmatizer",
3+
"type": "library",
4+
"description": "PHP Wrapper for the TreeTagger library",
5+
"keywords": [
6+
"TreeTagger",
7+
"PHP",
8+
"lemmatizer"
9+
],
10+
"homepage": "https://github.com/mbeurel/php-lemmatizer",
11+
"license": "MIT",
12+
"authors": [
13+
{
14+
"name": "Matthieu Beurel",
15+
"email": "m.beurel@nexboard.fr",
16+
"homepage": "https://www.nexboard.fr"
17+
},
18+
{
19+
"name": "Clément Meunier",
20+
"email": "c.meunier@nexboard.fr",
21+
"homepage": "https://www.nexboard.fr"
22+
}
23+
],
24+
"require": {
25+
"ext-json": "*",
26+
"php": "^7.1",
27+
"symfony/process": "^4.1"
28+
},
29+
"require-dev": {
30+
"symfony/debug-pack": "^1.0.6"
31+
},
32+
"autoload": {
33+
"psr-4": { "PhpTreeTagger\\": "lib" }
34+
}
35+
}

exemple/exemple.php

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
<?php
/**
 * Command-line example: lemmatize one or more French texts with TreeTagger.
 *
 * Options:
 *   --word=TEXT      Text to lemmatize; separate several texts with "|".
 *   --debug          Set the "debug" option to true.
 *   --uniqueWord     Set the "uniqueWord" option to true.
 *   --removeAccent   Set the "removeAccent" option to true.
 *   --nbProcess=N    Value of the "nbProcess" option (default: 6).
 *   --help           Print the usage summary and exit.
 */

// Resolve the autoloader from this file's directory (like $treeTaggerPath below),
// so the script works whatever the current working directory is.
require __DIR__."/../vendor/autoload.php";

use PhpTreeTagger\TreeTagger;

$treeTaggerPath = __DIR__."/treeTagger";
$wordLemmatizer = "";
$debug = false;
$uniqueWord = false;
$removeAccent = false;
$nbProcess = 6;
$help = false;

$usage = "Usage: php exemple.php --word=TEXT[|TEXT...] [--debug] [--uniqueWord] [--removeAccent] [--nbProcess=N] [--help]".PHP_EOL;

// Options are recognised by prefix only, so a --word text that happens to
// contain "--debug" or "--help" is no longer mistaken for that flag.
$startsWith = function (string $value, string $prefix): bool {
    return strpos($value, $prefix) === 0;
};

try {
    // $argv[0] is the script name; only the following entries are options.
    foreach (array_slice($argv, 1) as $value) {
        if ($startsWith($value, "--help")) {
            $help = true;
        } elseif ($startsWith($value, "--debug")) {
            $debug = true;
        } elseif ($startsWith($value, "--uniqueWord")) {
            $uniqueWord = true;
        } elseif ($startsWith($value, "--removeAccent")) {
            $removeAccent = true;
        } elseif ($startsWith($value, "--nbProcess=")) {
            $nbProcess = (int) substr($value, strlen("--nbProcess="));
        } elseif ($startsWith($value, "--word=")) {
            // Strip only the leading "--word=", not occurrences inside the text.
            $wordLemmatizer = (string) substr($value, strlen("--word="));
            if (strpos($wordLemmatizer, "|") !== false) {
                $wordLemmatizer = explode("|", $wordLemmatizer);
            }
        } else {
            // Thrown inside the try block so it is reported like every other error.
            throw new \Exception("Error : The parameters $value is not defined");
        }
    }

    if ($help) {
        echo $usage;
        exit(0);
    }

    // Strict comparison: "0" is a valid text and must not be rejected.
    if ($wordLemmatizer === "") {
        throw new \Exception("Error : the --word option is required");
    }

    // NOTE(review): README.md names these options "wordUnique" and
    // "wordRemoveAccent"; confirm against TreeTagger which keys are read.
    $treeTagger = new TreeTagger("french", array(
            "treeTaggerPath" => $treeTaggerPath,
            "debug"          => $debug,
            "uniqueWord"     => $uniqueWord,
            "removeAccent"   => $removeAccent,
            "nbProcess"      => $nbProcess
        )
    );

    // Remove these types of words
    $treeTagger->setCleanTypeWords(array(
            "PRO:PER",
            "DET:ART",
            "DET:POS",
            "SENT",
            "PRP"
        )
    );

    var_dump($treeTagger->lemmatizer($wordLemmatizer));
} catch (\Exception $e) {
    echo $e, PHP_EOL;
    // Signal the failure to the calling shell.
    exit(1);
}
72+
73+

lib/Tools/ToolsTrait.php

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
<?php
2+
3+
4+
namespace PhpTreeTagger\Tools;
5+
6+
7+
/**
 * Helper methods for stripping accents and normalising array arguments.
 */
trait ToolsTrait
{

    /**
     * Accented / special characters and the ASCII text that replaces them
     * in removeAccents().
     *
     * "Æ"/"æ" keep their original single-letter replacement; "Œ"/"œ" expand to
     * two letters, the usual ASCII spelling in French ("cœur" => "coeur").
     *
     * @var array
     */
    private static $accentsReplacements = array(
        "¥" => "Y", "µ" => "u", "À" => "A", "Á" => "A",
        "Â" => "A", "Ã" => "A", "Ä" => "A", "Å" => "A",
        "Æ" => "A", "Ç" => "C", "È" => "E", "É" => "E",
        "Ê" => "E", "Ë" => "E", "Ì" => "I", "Í" => "I",
        "Î" => "I", "Ï" => "I", "Ð" => "D", "Ñ" => "N",
        "Ò" => "O", "Ó" => "O", "Ô" => "O", "Õ" => "O",
        "Ö" => "O", "Ø" => "O", "Ù" => "U", "Ú" => "U",
        "Û" => "U", "Ü" => "U", "Ý" => "Y", "Ÿ" => "Y",
        "Œ" => "OE", "ß" => "s",
        "à" => "a", "á" => "a", "â" => "a", "ã" => "a",
        "ä" => "a", "å" => "a", "æ" => "a", "ç" => "c",
        "è" => "e", "é" => "e", "ê" => "e", "ë" => "e",
        "ì" => "i", "í" => "i", "î" => "i", "ï" => "i",
        // "ð" is the lowercase of "Ð" (mapped to "D" above), so it maps to "d".
        "ð" => "d", "ñ" => "n", "ò" => "o", "ó" => "o",
        "ô" => "o", "õ" => "o", "ö" => "o", "ø" => "o",
        "ù" => "u", "ú" => "u", "û" => "u", "ü" => "u",
        "ý" => "y", "ÿ" => "y", "œ" => "oe");

    /**
     * Trim the text, replace every character listed in $accentsReplacements
     * and remove all "!" characters.
     *
     * @param string $texte text to clean
     *
     * @return string
     */
    public function removeAccents(string $texte) : string
    {
        $withoutAccents = strtr(trim($texte), self::$accentsReplacements);

        return str_replace('!', '', $withoutAccents);
    }

    /**
     * Get the first value of an array.
     *
     * A non-array argument is returned unchanged; an empty array yields $default.
     *
     * @param mixed $array
     * @param mixed $default value returned when $array is an empty array
     *
     * @return array|mixed|null
     */
    public static function first($array, $default = null)
    {
        if (!is_array($array)) {
            return $array;
        }
        if (empty($array)) {
            return $default;
        }

        // $array is a by-value copy, so moving its internal pointer has no effect on the caller.
        return reset($array);
    }

    /**
     * Wrap a non-array value in a one-element array; arrays are returned as is.
     *
     * @param mixed $data
     *
     * @return array
     */
    private function toArray($data): array
    {
        return \is_array($data) ? $data : [$data];
    }

}

0 commit comments

Comments
 (0)