Skip to content

Commit

Permalink
Address remaining reviews
Browse files (browse the repository at this point in the history)
  • Loading branch information
st3iny committed Nov 15, 2024
1 parent 39a8ce5 commit f4fc5bc
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 9 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,6 +11,7 @@

use OCA\Mail\Account;
use OCA\Mail\Db\Message;
use OCA\Mail\Service\Classification\ImportanceClassifier;
use Rubix\ML\Datasets\Labeled;
use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\MultibyteTextNormalizer;
Expand All @@ -21,13 +22,15 @@
use function array_map;

class SubjectExtractor implements IExtractor {
private const MAX_VOCABULARY_SIZE = 500;

private WordCountVectorizer $wordCountVectorizer;
private TfIdfTransformer $tfidf;
private int $max = -1;

public function __construct() {

Check warning on line 31 in lib/Service/Classification/FeatureExtraction/SubjectExtractor.php

View check run for this annotation

Codecov / codecov/patch

lib/Service/Classification/FeatureExtraction/SubjectExtractor.php#L31

Added line #L31 was not covered by tests
// Limit vocabulary to limit memory usage
$this->wordCountVectorizer = new WordCountVectorizer(500);
$this->wordCountVectorizer = new WordCountVectorizer(self::MAX_VOCABULARY_SIZE);
$this->tfidf = new TfIdfTransformer();

Check warning on line 34 in lib/Service/Classification/FeatureExtraction/SubjectExtractor.php

View check run for this annotation

Codecov / codecov/patch

lib/Service/Classification/FeatureExtraction/SubjectExtractor.php#L33-L34

Added lines #L33 - L34 were not covered by tests
}

Expand All @@ -53,15 +56,14 @@ public function prepare(Account $account, array $incomingMailboxes, array $outgo
$data = array_map(static function (Message $message) {
return [
'text' => $message->getSubject() ?? '',
'label' => $message->getFlagImportant() ? 'i' : 'ni',
'label' => $message->getFlagImportant()
? ImportanceClassifier::LABEL_IMPORTANT
: ImportanceClassifier::LABEL_NOT_IMPORTANT,
];
}, $messages);

Check warning on line 63 in lib/Service/Classification/FeatureExtraction/SubjectExtractor.php

View check run for this annotation

Codecov / codecov/patch

lib/Service/Classification/FeatureExtraction/SubjectExtractor.php#L56-L63

Added lines #L56 - L63 were not covered by tests

// Fit transformers
Labeled::build(
array_column($data, 'text'),
array_column($data, 'label'),
)
Labeled::build(array_column($data, 'text'), array_column($data, 'label'))
->apply(new MultibyteTextNormalizer())
->apply($this->wordCountVectorizer)
->apply($this->tfidf);

Check warning on line 69 in lib/Service/Classification/FeatureExtraction/SubjectExtractor.php

View check run for this annotation

Codecov / codecov/patch

lib/Service/Classification/FeatureExtraction/SubjectExtractor.php#L66-L69

Added lines #L66 - L69 were not covered by tests
Expand Down Expand Up @@ -104,6 +106,5 @@ private function limitFeatureSize(): void {
}

$this->max = count($vocabularies[0]);

Check warning on line 108 in lib/Service/Classification/FeatureExtraction/SubjectExtractor.php

View check run for this annotation

Codecov / codecov/patch

lib/Service/Classification/FeatureExtraction/SubjectExtractor.php#L108

Added line #L108 was not covered by tests
echo("WCF vocab size: {$this->max}\n");
}
}
4 changes: 2 additions & 2 deletions lib/Service/Classification/ImportanceClassifier.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -72,12 +72,12 @@ class ImportanceClassifier {
/**
* @var string label for data sets that are classified as important
*/
private const LABEL_IMPORTANT = 'i';
public const LABEL_IMPORTANT = 'i';

/**
* @var string label for data sets that are classified as not important
*/
private const LABEL_NOT_IMPORTANT = 'ni';
public const LABEL_NOT_IMPORTANT = 'ni';

/**
* The minimum number of important messages. Without those the unsupervised
Expand Down

0 comments on commit f4fc5bc

Please sign in to comment.