tesseract OCR support for spamfilters

2016-06-09 11:08:29 +02:00 · 2016-06-09 11:08:29 +02:00 · 8a46c7a0d5
commit 8a46c7a0d5
parent 36d762514c
4 changed files with 52 additions and 6 deletions
--- a/inc/config.php
+++ b/inc/config.php
@ -824,6 +824,15 @@
 	// Set this to true if you're using Linux and you can execute `md5sum` binary.
 	$config['gnu_md5'] = false;

+	// Use Tesseract OCR to extract text from uploaded images, so that spam filters can match against it.
+	$config['tesseract_ocr'] = false;
+
+	// Extra command-line parameters appended to the tesseract invocation.
+	$config['tesseract_params'] = '';
+
+	// Preprocess command whose output is piped to tesseract; %s is replaced with the shell-escaped image path.
+	$config['tesseract_preprocess_command'] = 'convert -monochrome %s -';
+
 	// Number of posts in a "View Last X Posts" page
 	$config['noko50_count'] = 50;
 	// Number of posts a thread needs before it gets a "View Last X Posts" page.
@ -1015,6 +1024,10 @@
 	// Minify Javascript using http://code.google.com/p/minify/.
 	$config['minify_js'] = false;

+	// Dispatch thumbnail loading and image configuration with JavaScript. This requires accompanying
+	// JavaScript code to work.
+	$config['javascript_image_dispatch'] = false;
+
 /*
 * ====================
 *  Video embedding
--- a/inc/functions.php
+++ b/inc/functions.php
@ -2695,7 +2695,7 @@ function slugify($post) {
 	elseif (isset ($post['body_nomarkup']) && $post['body_nomarkup'])
 		$slug = $post['body_nomarkup'];
 	elseif (isset ($post['body']) && $post['body'])
-		$slug = strip_html($post['body']);
+		$slug = strip_tags($post['body']);

 	// Fix UTF-8 first
 	$slug = mb_convert_encoding($slug, "UTF-8", "UTF-8");
--- a/post.php
+++ b/post.php
@ -652,14 +652,14 @@ if (isset($_POST['delete'])) {
 			$post['filehash'] = md5($allhashes);
 		}
 	}
-	
+
 	if (!hasPermission($config['mod']['bypass_filters'], $board['uri'])) {
-		require_once 'inc/filters.php';	
-		
+		require_once 'inc/filters.php';
+
 		do_filters($post);
 	}
-	
-	if ($post['has_file']) {	
+
+	if ($post['has_file']) {
 		foreach ($post['files'] as $key => &$file) {
 		if ($file['is_an_image']) {
 			if ($config['ie_mime_type_detection'] !== false) {
@ -787,6 +787,34 @@ if (isset($_POST['delete'])) {
 			$file['thumbwidth'] = $size[0];
 			$file['thumbheight'] = $size[1];
 		}
+
+		if ($config['tesseract_ocr']) { // Let's OCR it!
+			$fname = $file['tmp_name'];
+
+			if ($file['height'] > 500 || $file['width'] > 500) {
+				$fname = $file['thumb'];
+			}
+
+			if ($fname == 'spoiler') { // We don't have that much CPU time, do we?
+			}
+			else {
+				$tmpname = "tmp/tesseract/".rand(0,10000000);
+
+				// The default preprocess command uses ImageMagick to reduce the image to black and white.
+				$error = shell_exec_error(sprintf($config['tesseract_preprocess_command'], escapeshellarg($fname)) . " | " .
+                                                          'tesseract stdin '.escapeshellarg($tmpname).' '.$config['tesseract_params']);
+				$tmpname .= ".txt";
+
+				$value = @file_get_contents($tmpname);
+				@unlink($tmpname);
+
+				if ($value && trim($value)) {
+					// The recognized text is appended to the post body inside a <tinyboard> tag, so that
+					// spam filters can be written to match against it.
+					$post['body_nomarkup'] .= "<tinyboard ocr image $key>".htmlspecialchars($value)."</tinyboard>";
+				}
+			}
+		}
 		
 		if (!isset($dont_copy_file) || !$dont_copy_file) {
 			if (isset($file['file_tmp'])) {
@ -827,6 +855,11 @@ if (isset($_POST['delete'])) {
 		}
 	}
 	
+	// Run the filters again when OCR is enabled, now that the OCR text has been appended to the post body.
+	if ($config['tesseract_ocr'] && !hasPermission($config['mod']['bypass_filters'], $board['uri'])) {
+		do_filters($post);
+	}
+
 	if (!hasPermission($config['mod']['postunoriginal'], $board['uri']) && $config['robot_enable'] && checkRobot($post['body_nomarkup'])) {
 		undoImage($post);
 		if ($config['robot_mute']) {
--- a/tmp/tesseract/.gitkeep
+++ b/tmp/tesseract/.gitkeep