Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: gzip all JSON OCRs when saving OCR file on disk #8320

Merged
merged 1 commit into from
May 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions lib/ProductOpener/Images.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ use ProductOpener::URL qw/:all/;
use ProductOpener::Users qw/:all/;
use ProductOpener::Text qw/:all/;

use IO::Compress::Gzip qw(gzip $GzipError);
use Log::Any qw($log);
use Encode;
use JSON::PP;
Expand Down Expand Up @@ -2002,7 +2003,7 @@ sub extract_text_from_image ($product_ref, $id, $field, $ocr_engine, $results_re
}
elsif ($ocr_engine eq 'google_cloud_vision') {

my $json_file = "$www_root/images/products/$path/$filename.json";
my $json_file = "$www_root/images/products/$path/$filename.json.gz";
open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log");
my $cloudvision_ref = send_image_to_cloud_vision($image, $json_file, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs);
close $gv_logs;
Expand Down Expand Up @@ -2047,7 +2048,7 @@ Call to Google Cloud vision API

=head4 $image_path - str path to image

=head4 $json_file - str path to the file where we will store OCR result as JSON
=head4 $json_file - str path to the file where we will store OCR result as gzipped JSON

=head4 $features_ref - hash reference - the "features" parameter of Google Cloud Vision

Expand Down Expand Up @@ -2108,14 +2109,17 @@ sub send_image_to_cloud_vision ($image_path, $json_file, $features_ref, $gv_logs

$cloudvision_ref = decode_json($json_response);

$log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info();
# Adding creation timestamp, to know when the OCR has been generated
$cloudvision_ref->{created_at} = time();

# UTF-8 issue , see https://stackoverflow.com/questions/4572007/perl-lwpuseragent-mishandling-utf-8-response
$json_response = decode("utf8", $json_response);
$log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info();

if (open(my $OUT, ">:encoding(UTF-8)", $json_file)) {
print($OUT $json_response);
close($OUT);
if (open(my $OUT, ">:raw", $json_file)) {
my $gzip_handle = IO::Compress::Gzip->new($OUT)
or die "Cannot create gzip filehandle: $GzipError\n";
my $encoded_json = encode_json($cloudvision_ref);
$gzip_handle->print($encoded_json);
$gzip_handle->close;

print($gv_logs "--> cloud vision success for $image_path\n");
}
Expand Down
44 changes: 44 additions & 0 deletions lib/ProductOpener/Test.pm
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ BEGIN {
&remove_all_orgs
&check_not_production
&wait_for
&read_gzip_file
&check_ocr_result
); # symbols to export on request
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
Expand All @@ -66,15 +68,57 @@ use ProductOpener::Store "store";
use Carp qw/confess/;
use Data::DeepAccess qw(deep_exists deep_get deep_set);
use Getopt::Long;
use IO::Uncompress::AnyInflate qw(anyinflate $AnyInflateError);
use Test::More;
use JSON "decode_json";
use File::Basename "fileparse";
use File::Path qw/make_path remove_tree/;
use File::Copy;
use Path::Tiny qw/path/;
use Scalar::Util qw(looks_like_number);

use Log::Any qw($log);

=head2 read_gzip_file($filepath)

Read a gzipped file and return its decompressed content as a byte string
(no character decoding is applied).

=head3 Parameters

=head4 String $filepath
The path of the gzipped file.

=head3 Return value

The decompressed content of the file.

Dies if the file cannot be opened or if decompression fails.

=cut

sub read_gzip_file ($filepath) {
	# Core 3-arg open in raw mode: compressed data must be read as bytes,
	# and this avoids relying on IO::File being loaded by another module.
	open(my $input, "<:raw", $filepath) or die "Cannot open '$filepath': $!\n";
	my $buffer;
	anyinflate($input => \$buffer) or die "anyinflate failed for '$filepath': $AnyInflateError\n";
	close($input);
	return $buffer;
}

=head2 check_ocr_result($ocr_result)

Check that a decoded OCR result is as expected:
- a `responses` field holding an array with exactly one response object
- a numeric `created_at` field

Three Test::More assertions are emitted; nothing is returned.

=head3 Parameters

=head4 Hash reference $ocr_result
The OCR result, already decoded from JSON into a Perl hash reference.

=cut

sub check_ocr_result ($ocr_result) {
	my $responses_ref = $ocr_result->{responses};
	ok(defined $responses_ref, "OCR result contains the 'responses' field");

	# Count the elements of the referenced array. Assigning the reference
	# itself to an array would always yield exactly one element, whatever
	# the number of responses.
	my $responses_count = (ref $responses_ref eq 'ARRAY') ? scalar @{$responses_ref} : 0;
	is($responses_count, 1, "OCR result contains a single response");

	my $created_at = $ocr_result->{created_at};
	# Avoid an "uninitialized value" warning in the test name when the field is missing
	my $created_at_label = $created_at // 'undef';
	ok((defined $created_at and looks_like_number($created_at)),
		"OCR result `created_at` field is valid, $created_at_label");
	return;
}

=head2 init_expected_results($filepath)

Handles test options around expected_results initialization
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_cloud_vision_ocr.pl
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ ($file)

my $json_file = $destination;
$json_file =~ s/\.([^\.]+)$//;
$json_file .= ".json";
$json_file .= ".json.gz";

print $LOG "file: $file destination: $destination code: $code image_url: $image_url json_file: $json_file\n";
open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log");
Expand Down
1 change: 1 addition & 0 deletions stop_words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,4 @@ weigher
weighers
www
xml
gzipped

This file was deleted.

6 changes: 3 additions & 3 deletions tests/integration/run_cloud_vision_ocr.t
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dircopy("$sample_products_images_path/$product_code_path", $image_dir);
fcopy($input_image_path, "$image_dir/2.jpg");
# fake responses for OCR and robotoff
my @responses = (
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"ocr": "success"}'),
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"responses": [{}]}'),
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"robotoff": "success"}'),
);
my $dump_path = File::Temp->newdir();
Expand All @@ -56,10 +56,10 @@ is(scalar @requests, 2, "Two request issued");
my $ocr_request = retrieve("$dump_path/req-0.sto");
my $request_json_body = decode_json($ocr_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/ocr_request_body.json", $update_expected_results);
my $ocr_content = read_file("$image_dir/2.json");
my $ocr_content = read_gzip_file("$image_dir/2.json.gz");
ok($ocr_content, "OCR file is not empty");
my $ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results);
check_ocr_result($ocr_data);
my $robotoff_request = retrieve("$dump_path/req-1.sto");
# we have url encoded parameters, and order might change --> convert to hash
my $request_content = url_params_mixed($robotoff_request->content());
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

31 changes: 17 additions & 14 deletions tests/unit/send_image_to_cloud_vision.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ use ProductOpener::Images qw/:all/;

my ($test_id, $test_dir, $expected_result_dir, $update_expected_results) = (init_expected_results(__FILE__));

# Default OCR response, containing a single response element
my $ocr_default_response = '{"responses": [{}]}';

my @ua_requests = ();
# put responses for call to requests here, we will pop first
my @ua_responses = ();
Expand All @@ -35,26 +38,26 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";

# normal test
open(my $gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
my $json_path = $tmp_dir . "/small-img.json";
my $json_path = $tmp_dir . "/small-img.json.gz";
# expected response
my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}');
my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
is(scalar @ua_requests, 1, "Normal test - One request issued to cloud vision");
my $issued_request = shift @ua_requests;
my $request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body.json", $update_expected_results);
my $ocr_content = read_file($json_path);
my $ocr_content = read_gzip_file($json_path);
ok($ocr_content, "normal test - OCR file is not empty");
my $ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results);
check_ocr_result($ocr_data);
my $logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "normal test - cloud vision success in logs");

# test new request updates
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
Expand All @@ -63,15 +66,15 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";
$request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_2.json",
$update_expected_results);
$ocr_content = read_file($json_path);
$ocr_content = read_gzip_file($json_path);
$ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_2.json", $update_expected_results);
check_ocr_result($ocr_data);
$logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "test request update - cloud vision success in logs");

# test with different feature set \@CLOUD_VISION_FEATURES_TEXT
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs);
close($gv_logs);
Expand All @@ -80,19 +83,19 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";
$request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_3.json",
$update_expected_results);
$ocr_content = read_file($json_path);
$ocr_content = read_gzip_file($json_path);
$ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_3.json", $update_expected_results);
check_ocr_result($ocr_data);
$logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "test request features text - cloud vision success in logs");

# test with bad json path
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision(
$image_path,
"/var/lib/not-a-directory/not-writable.json",
"/var/lib/not-a-directory/not-writable.json.gz",
\@CLOUD_VISION_FEATURES_FULL, $gv_logs
);
close($gv_logs);
Expand All @@ -105,8 +108,8 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";

# test bad request
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$json_path = $tmp_dir . "/small-img2.json";
$response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), '{"foo": "blah"}');
$json_path = $tmp_dir . "/small-img2.json.gz";
$response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
Expand Down