Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add automatic length calculation #5

Merged
merged 5 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ which is a thin wrapper around `CompactEncDet::DetectEncoding` and `MimeEncoding

> ```ruby
> file = File.read("unknown-encoding.txt")
> result = CompactEncDet.detect_encoding(file, file.bytesize)
> result = CompactEncDet.detect_encoding(file)
> result.encoding
> # => #<Encoding:Windows-1250>
> result.bytes_consumed
Expand Down
23 changes: 12 additions & 11 deletions ext/compact_enc_det/compact_enc_det.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ void Init_detect_encoding_result(VALUE rb_mCompactEncDet)
// for the CompactEncDet::DetectEncoding C++ function
static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
{
VALUE ruby_text,
ruby_text_length,
VALUE text,
text_length,
url_hint,
http_charset_hint,
meta_charset_hint,
Expand All @@ -45,9 +45,9 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
ignore_7bit_mail_encodings;

// Parse the Ruby arguments
rb_scan_args(argc, argv, "27",
&ruby_text,
&ruby_text_length,
rb_scan_args(argc, argv, "17",
&text,
&text_length,
&url_hint,
&http_charset_hint,
&meta_charset_hint,
Expand All @@ -56,17 +56,18 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
&corpus_type,
&ignore_7bit_mail_encodings);

// Convert the Ruby values to C types
const char *text = StringValueCStr(ruby_text);
const int text_length = NUM2INT(ruby_text_length);
// Convert the Ruby arguments to C++ types
const char* c_text = StringValueCStr(text);
const int c_text_length = NIL_P(text_length) ? strlen(c_text) : NUM2INT(text_length);

// Declare the output variables
int bytes_consumed;
bool is_reliable;

// Detect the encoding using CompactEncDet::DetectEncoding
Encoding encoding = CompactEncDet::DetectEncoding(
text, text_length,
c_text,
c_text_length,
NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint),
NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint),
NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint),
Expand All @@ -76,11 +77,11 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
NIL_P(ignore_7bit_mail_encodings) ? false : RTEST(ignore_7bit_mail_encodings),
&bytes_consumed,
&is_reliable);

// Convert the encoding enum to string using MimeEncodingName
const char* encoding_mime_name = MimeEncodingName(encoding);
VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);

// Find the Ruby Encoding class
VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);

Expand Down
11 changes: 10 additions & 1 deletion test/compact_enc_det_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,16 @@
require_relative "../lib/compact_enc_det"

class CompactEncDetTest < Minitest::Test
def test_detect_encoding_known_english
# Calls detect_encoding with the text only (no explicit length argument) and
# checks the reported encoding, consumed byte count and reliability flag.
def test_detect_encoding
  utf8_sample = File.read("test/fixtures/utf-8.txt")
  detection = CompactEncDet.detect_encoding(utf8_sample)

  # The fixture is expected to be recognised as UTF-8.
  assert_equal Encoding::UTF_8, detection.encoding
  # At least one byte must have been consumed.
  assert_operator 0, :<, detection.bytes_consumed
  # Strictly `true`, not merely truthy.
  assert_equal true, detection.is_reliable?
end

def test_detect_encoding_with_explicit_length
text = File.read("test/fixtures/utf-8.txt")
result = CompactEncDet.detect_encoding(text, text.bytesize)

Expand Down