This repository has been archived by the owner on Jan 27, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathocr.php
71 lines (55 loc) · 2.87 KB
/
ocr.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
<?php
/**
 * OCR endpoint.
 *
 * Looks up the PDF file name stored in the library table for the record id
 * given in $_GET['file'], renders every page to a 300 dpi bitmap with
 * Ghostscript, runs the OCR program on each bitmap, and stores the combined,
 * whitespace-normalized text in the full_text table for that record.
 *
 * Output: nothing on success; a short plain-text error message on failure.
 */
include_once 'data.php';
include_once 'functions.php';

// Close the session before the long-running work starts.
session_write_close();

// A missing parameter is treated like id 0 (no such record) without a notice.
$file_id = isset($_GET['file']) ? intval($_GET['file']) : 0;

database_connect(IL_DATABASE_PATH, 'library');
$file_query = $dbHandle->quote($file_id);
$result = $dbHandle->query("SELECT file FROM library WHERE id=$file_query LIMIT 1");
// Guard against a failed query so we never call a method on false.
$file = $result ? $result->fetchColumn() : false;
$result = null;
$dbHandle = null;

$answer = null;
$pdf_path = null;
if (is_string($file) && $file !== '') {
    $pdf_path = IL_PDF_PATH . DIRECTORY_SEPARATOR . get_subfolder($file) . DIRECTORY_SEPARATOR . $file;
}

if ($pdf_path === null || !is_file($pdf_path)) {
    $answer = "PDF file not found.";
} else {
    $temp_base = IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file;
    $bmp_pattern = $temp_base . '.%03d.bmp';

    // escapeshellarg() replaces '%' with a space on Windows, which would
    // destroy the %03d page placeholder, so plain double quotes are kept
    // there. Everywhere else the argument is escaped properly.
    if (DIRECTORY_SEPARATOR === '\\') {
        $output_arg = '"' . $bmp_pattern . '"';
    } else {
        $output_arg = escapeshellarg($bmp_pattern);
    }

    exec(select_ghostscript()
        . ' -dSAFER -dBATCH -dNOPAUSE -sDEVICE=bmp16m -r300 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dDOINTERPOLATE -o '
        . $output_arg . ' ' . escapeshellarg($pdf_path));

    $file_arr = glob($temp_base . '.*.bmp');

    // glob() yields an empty array when nothing matched and false on error;
    // both mean that Ghostscript produced no page images.
    if (empty($file_arr)) {
        $answer = "Ghostscript not functional.";
    } else {
        set_time_limit(600);
        $string = '';

        foreach ($file_arr as $i => $bmp_file) {
            // The script expects the OCR program to write "<base>.txt".
            $ocr_base = $temp_base . '.' . $i;
            $ocr_text_file = $ocr_base . '.txt';

            // Remove a leftover result from an earlier aborted run so that it
            // cannot be mistaken for the output of this run.
            if (is_file($ocr_text_file)) {
                unlink($ocr_text_file);
            }

            exec(select_tesseract() . ' ' . escapeshellarg($bmp_file) . ' ' . escapeshellarg($ocr_base));

            if (!is_file($ocr_text_file)) {
                $answer = 'OCR software not functional.';
                break;
            }

            // Accumulate in memory instead of appending to a shared temp file.
            $string .= file_get_contents($ocr_text_file);
            unlink($ocr_text_file);
            unlink($bmp_file);
        }

        // After an early exit, remove the page images that were not processed.
        foreach ($file_arr as $bmp_file) {
            if (is_file($bmp_file)) {
                unlink($bmp_file);
            }
        }

        if ($answer === null) {
            // Keep only tab, LF, CR and U+0020-U+D7FF / U+E000-U+FFFD.
            $string = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $string);
            $string = trim((string) $string);

            // Strict comparison: the text "0" is valid and must not be dropped.
            if ($string !== '') {
                // Flatten line breaks and collapse runs of whitespace.
                $order = array("\r\n", "\n", "\r");
                $string = str_replace($order, ' ', $string);
                $string = preg_replace('/\s{2,}/ui', ' ', $string);

                database_connect(IL_DATABASE_PATH, 'fulltext');
                $file_query = $dbHandle->quote($file_id);
                $fulltext_query = $dbHandle->quote($string);
                $dbHandle->beginTransaction();
                $dbHandle->exec("DELETE FROM full_text WHERE fileID=$file_query");
                $output = $dbHandle->exec("INSERT INTO full_text (fileID,full_text) VALUES ($file_query,$fulltext_query)");
                if ($output) {
                    $dbHandle->commit();
                } else {
                    // Keep the previously stored text when the insert fails.
                    $dbHandle->rollBack();
                    $answer = 'Database error.';
                }
                $dbHandle = null;
            } else {
                $answer = "OCR text extraction failed.";
            }
        }
    }
}

if ($answer !== null) {
    echo $answer;
}