tesseract OCR support for spamfilters

This commit is contained in:
czaks 2016-06-09 11:08:29 +02:00
parent 36d762514c
commit 8a46c7a0d5
4 changed files with 52 additions and 6 deletions

View File

@ -824,6 +824,15 @@
// Set this to true if you're using Linux and you can execute `md5sum` binary. // Set this to true if you're using Linux and you can execute `md5sum` binary.
$config['gnu_md5'] = false; $config['gnu_md5'] = false;
// Use Tesseract OCR to extract text from uploaded images, so that spam filters can match against it.
$config['tesseract_ocr'] = false;
// Extra command-line parameters appended verbatim to the tesseract invocation
$config['tesseract_params'] = '';
// Tesseract preprocess command: %s is replaced with the shell-escaped image path, and the command's output is piped to tesseract
$config['tesseract_preprocess_command'] = 'convert -monochrome %s -';
// Number of posts in a "View Last X Posts" page // Number of posts in a "View Last X Posts" page
$config['noko50_count'] = 50; $config['noko50_count'] = 50;
// Number of posts a thread needs before it gets a "View Last X Posts" page. // Number of posts a thread needs before it gets a "View Last X Posts" page.
@ -1015,6 +1024,10 @@
// Minify Javascript using http://code.google.com/p/minify/. // Minify Javascript using http://code.google.com/p/minify/.
$config['minify_js'] = false; $config['minify_js'] = false;
// Dispatch thumbnail loading and image configuration with JavaScript. This requires accompanying
// JavaScript code to work.
$config['javascript_image_dispatch'] = false;
/* /*
* ==================== * ====================
* Video embedding * Video embedding

View File

@ -2695,7 +2695,7 @@ function slugify($post) {
elseif (isset ($post['body_nomarkup']) && $post['body_nomarkup']) elseif (isset ($post['body_nomarkup']) && $post['body_nomarkup'])
$slug = $post['body_nomarkup']; $slug = $post['body_nomarkup'];
elseif (isset ($post['body']) && $post['body']) elseif (isset ($post['body']) && $post['body'])
$slug = strip_html($post['body']); $slug = strip_tags($post['body']);
// Fix UTF-8 first // Fix UTF-8 first
$slug = mb_convert_encoding($slug, "UTF-8", "UTF-8"); $slug = mb_convert_encoding($slug, "UTF-8", "UTF-8");

View File

@ -652,14 +652,14 @@ if (isset($_POST['delete'])) {
$post['filehash'] = md5($allhashes); $post['filehash'] = md5($allhashes);
} }
} }
if (!hasPermission($config['mod']['bypass_filters'], $board['uri'])) { if (!hasPermission($config['mod']['bypass_filters'], $board['uri'])) {
require_once 'inc/filters.php'; require_once 'inc/filters.php';
do_filters($post); do_filters($post);
} }
if ($post['has_file']) { if ($post['has_file']) {
foreach ($post['files'] as $key => &$file) { foreach ($post['files'] as $key => &$file) {
if ($file['is_an_image']) { if ($file['is_an_image']) {
if ($config['ie_mime_type_detection'] !== false) { if ($config['ie_mime_type_detection'] !== false) {
@ -787,6 +787,34 @@ if (isset($_POST['delete'])) {
$file['thumbwidth'] = $size[0]; $file['thumbwidth'] = $size[0];
$file['thumbheight'] = $size[1]; $file['thumbheight'] = $size[1];
} }
if ($config['tesseract_ocr']) { // Let's OCR it!
	// Run Tesseract over the uploaded image and append any recognised text to the post body,
	// so that spam filters can match against text embedded in images.
	$fname = $file['tmp_name'];
	if ($file['height'] > 500 || $file['width'] > 500) {
		// Large image: OCR the thumbnail value instead (presumably to keep OCR cheap).
		$fname = $file['thumb'];
	}

	// A thumb value of 'spoiler' is not OCRed at all. We don't have that much CPU time, do we?
	if ($fname !== 'spoiler') {
		// tempnam() atomically reserves a unique base name. The previous rand()-based name could
		// collide between concurrent posts, letting one request read or delete another's output.
		$tmpname = tempnam('tmp/tesseract', 'ocr');

		if ($tmpname !== false) {
			// Preprocess command is an ImageMagick b/w quantization; its output is piped to tesseract,
			// which reads the image from stdin. The return value is not inspected here: a failed run
			// simply leaves no output file behind.
			$error = shell_exec_error(sprintf($config['tesseract_preprocess_command'], escapeshellarg($fname)) . " | " .
				'tesseract stdin '.escapeshellarg($tmpname).' '.$config['tesseract_params']);

			// The recognised text is read from "<output base>.txt".
			$outfile = $tmpname . ".txt";
			$value = is_readable($outfile) ? file_get_contents($outfile) : false;

			// Clean up both the OCR output and the placeholder file created by tempnam().
			if (file_exists($outfile)) {
				unlink($outfile);
			}
			if (file_exists($tmpname)) {
				unlink($tmpname);
			}

			// Compare against '' explicitly so that OCR text consisting solely of "0" is not discarded.
			if (is_string($value) && trim($value) !== '') {
				// The OCR text is appended to the post body, wrapped in a <tinyboard> tag that carries
				// the file key, so that a spamfilter can be written against it.
				$post['body_nomarkup'] .= "<tinyboard ocr image $key>".htmlspecialchars($value)."</tinyboard>";
			}
		}
	}
}
if (!isset($dont_copy_file) || !$dont_copy_file) { if (!isset($dont_copy_file) || !$dont_copy_file) {
if (isset($file['file_tmp'])) { if (isset($file['file_tmp'])) {
@ -827,6 +855,11 @@ if (isset($_POST['delete'])) {
} }
} }
// Run the filters a second time when OCR is enabled: the first do_filters() pass happens before the
// files are processed, so it cannot see OCR text appended to $post['body_nomarkup'] afterwards.
// Users with the bypass_filters permission on this board are exempt, as in the first pass.
if ($config['tesseract_ocr'] && !hasPermission($config['mod']['bypass_filters'], $board['uri'])) {
do_filters($post);
}
if (!hasPermission($config['mod']['postunoriginal'], $board['uri']) && $config['robot_enable'] && checkRobot($post['body_nomarkup'])) { if (!hasPermission($config['mod']['postunoriginal'], $board['uri']) && $config['robot_enable'] && checkRobot($post['body_nomarkup'])) {
undoImage($post); undoImage($post);
if ($config['robot_mute']) { if ($config['robot_mute']) {

0
tmp/tesseract/.gitkeep Normal file
View File