diff --git a/README.md b/README.md index 7ce44e6..1bc885b 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ which is a thin wrapper around `CompactEncDet::DetectEncoding` and `MimeEncoding > ```ruby > file = File.read("unknown-encoding.txt") -> result = CompactEncDet.detect_encoding(file, file.bytesize) +> result = CompactEncDet.detect_encoding(file) > result.encoding > # => # > result.bytes_consumed diff --git a/ext/compact_enc_det/compact_enc_det.cc b/ext/compact_enc_det/compact_enc_det.cc index 3ffe0fd..8a29b46 100644 --- a/ext/compact_enc_det/compact_enc_det.cc +++ b/ext/compact_enc_det/compact_enc_det.cc @@ -34,8 +34,8 @@ void Init_detect_encoding_result(VALUE rb_mCompactEncDet) // for the CompactEncDet::DetectEncoding C++ function static VALUE detect_encoding(int argc, VALUE *argv, VALUE self) { - VALUE ruby_text, - ruby_text_length, + VALUE text, + text_length, url_hint, http_charset_hint, meta_charset_hint, @@ -45,9 +45,9 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self) ignore_7bit_mail_encodings; // Parse the Ruby arguments - rb_scan_args(argc, argv, "27", - &ruby_text, - &ruby_text_length, + rb_scan_args(argc, argv, "18", + &text, + &text_length, &url_hint, &http_charset_hint, &meta_charset_hint, @@ -56,9 +56,9 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self) &corpus_type, &ignore_7bit_mail_encodings); - // Convert the Ruby values to C types - const char *text = StringValueCStr(ruby_text); - const int text_length = NUM2INT(ruby_text_length); + // Convert the Ruby arguments to C++ types; a nil length defaults to the string's byte length + const char* c_text = StringValueCStr(text); + const int c_text_length = NIL_P(text_length) ? 
static_cast<int>(RSTRING_LEN(text)) : NUM2INT(text_length); // Declare the output variables int bytes_consumed; @@ -66,7 +66,8 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self) // Detect the encoding using CompactEncDet::DetectEncoding Encoding encoding = CompactEncDet::DetectEncoding( - text, text_length, + c_text, + c_text_length, NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint), NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint), NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint), @@ -76,11 +77,11 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self) NIL_P(ignore_7bit_mail_encodings) ? false : RTEST(ignore_7bit_mail_encodings), &bytes_consumed, &is_reliable); - + // Convert the encoding enum to string using MimeEncodingName const char* encoding_mime_name = MimeEncodingName(encoding); VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name); - + // Find the Ruby Encoding class VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name); diff --git a/test/compact_enc_det_test.rb b/test/compact_enc_det_test.rb index e5241f8..8758422 100644 --- a/test/compact_enc_det_test.rb +++ b/test/compact_enc_det_test.rb @@ -2,7 +2,16 @@ require_relative "../lib/compact_enc_det" class CompactEncDetTest < Minitest::Test - def test_detect_encoding_known_english + def test_detect_encoding + text = File.read("test/fixtures/utf-8.txt") + result = CompactEncDet.detect_encoding(text) + + assert_equal Encoding::UTF_8, result.encoding + assert_operator 0, :<, result.bytes_consumed + assert_equal true, result.is_reliable? + end + + def test_detect_encoding_with_explicit_length text = File.read("test/fixtures/utf-8.txt") result = CompactEncDet.detect_encoding(text, text.bytesize)