From 94f90004df1361ef200744b14bc66e4a61a3e358 Mon Sep 17 00:00:00 2001 From: atsu1125 Date: Thu, 10 Nov 2022 16:08:55 +0900 Subject: [PATCH] Revert "Remove language detection through cld3 (#17478)" This reverts commit b6d7726ecbc833abd00f6a9d36b24d9776cfe623. --- Gemfile | 2 + Gemfile.lock | 5 + app/helpers/languages_helper.rb | 291 +++++------------- app/helpers/settings_helper.rb | 2 +- app/lib/activitypub/activity/create.rb | 6 +- app/lib/language_detector.rb | 101 ++++++ app/lib/link_details_extractor.rb | 9 +- app/models/user.rb | 4 - .../process_status_update_service.rb | 6 +- app/services/post_status_service.rb | 7 +- app/validators/import_validator.rb | 2 - .../settings/preferences/other/show.html.haml | 4 +- config/locales/en.yml | 2 +- lib/tasks/repo.rake | 5 +- spec/helpers/languages_helper_spec.rb | 6 +- spec/lib/language_detector_spec.rb | 134 ++++++++ 16 files changed, 348 insertions(+), 238 deletions(-) create mode 100644 app/lib/language_detector.rb create mode 100644 spec/lib/language_detector_spec.rb diff --git a/Gemfile b/Gemfile index ae999d9643b86f..9c055b93bc0aef 100644 --- a/Gemfile +++ b/Gemfile @@ -29,7 +29,9 @@ gem 'addressable', '~> 2.8' gem 'bootsnap', '~> 1.10.3', require: false gem 'browser' gem 'charlock_holmes', '~> 0.7.7' +gem 'iso-639' gem 'chewy', '~> 7.2' +gem 'cld3', '~> 3.4.4' gem 'devise', '~> 4.8' gem 'devise-two-factor', '~> 4.0' diff --git a/Gemfile.lock b/Gemfile.lock index 17fef6d487f326..28cceb6ebeac0f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -152,6 +152,8 @@ GEM elasticsearch (>= 7.12.0, < 7.14.0) elasticsearch-dsl chunky_png (1.4.0) + cld3 (3.4.4) + ffi (>= 1.1.0, < 1.16.0) climate_control (0.2.0) coderay (1.1.3) color_diff (0.1) @@ -301,6 +303,7 @@ GEM terminal-table (>= 1.5.1) idn-ruby (0.1.4) ipaddress (0.8.3) + iso-639 (0.3.5) jmespath (1.5.0) json (2.5.1) json-canonicalization (0.3.0) @@ -698,6 +701,7 @@ DEPENDENCIES capybara (~> 3.36) charlock_holmes (~> 0.7.7) chewy (~> 7.2) + cld3 (~> 3.4.4) climate_control (~> 0.2) color_diff (~> 0.1) concurrent-ruby @@ -725,6 +729,7 @@ DEPENDENCIES httplog (~> 1.5.0) i18n-tasks (~> 0.9) idn-ruby + iso-639 json-ld json-ld-preloaded (~> 3.2) kaminari (~> 1.2) diff --git a/app/helpers/languages_helper.rb b/app/helpers/languages_helper.rb index f3ed7b3140c811..73072420864c9b 100644 --- a/app/helpers/languages_helper.rb +++ b/app/helpers/languages_helper.rb @@ -1,237 +1,94 @@ # frozen_string_literal: true module LanguagesHelper - ISO_639_1 = { - aa: ['Afar', 'Afaraf'].freeze, - ab: ['Abkhaz', 'аҧсуа бызшәа'].freeze, - ae: ['Avestan', 'avesta'].freeze, - af: ['Afrikaans', 'Afrikaans'].freeze, - ak: ['Akan', 'Akan'].freeze, - am: ['Amharic', 'አማርኛ'].freeze, - an: ['Aragonese', 'aragonés'].freeze, - ar: ['Arabic', 'اللغة العربية'].freeze, - as: ['Assamese', 'অসমীয়া'].freeze, - av: ['Avaric', 'авар мацӀ'].freeze, - ay: ['Aymara', 'aymar aru'].freeze, - az: ['Azerbaijani', 'azərbaycan dili'].freeze, - ba: ['Bashkir', 'башҡорт теле'].freeze, - be: ['Belarusian', 'беларуская мова'].freeze, - bg: ['Bulgarian', 'български език'].freeze, - bh: ['Bihari', 'भोजपुरी'].freeze, - bi: ['Bislama', 'Bislama'].freeze, - bm: ['Bambara', 'bamanankan'].freeze, - bn: ['Bengali', 'বাংলা'].freeze, - bo: ['Tibetan', 'བོད་ཡིག'].freeze, - br: ['Breton', 'brezhoneg'].freeze, - bs: ['Bosnian', 'bosanski jezik'].freeze, - ca: ['Catalan', 'Català'].freeze, - ce: ['Chechen', 'нохчийн мотт'].freeze, - ch: ['Chamorro', 'Chamoru'].freeze, - co: ['Corsican', 'corsu'].freeze, - cr: ['Cree', 'ᓀᐦᐃᔭᐍᐏᐣ'].freeze, - cs: ['Czech', 'čeština'].freeze, - cu: ['Old Church Slavonic', 'ѩзыкъ словѣньскъ'].freeze, - cv: ['Chuvash', 'чӑваш чӗлхи'].freeze, - cy: ['Welsh', 'Cymraeg'].freeze, - da: ['Danish', 'dansk'].freeze, - de: ['German', 'Deutsch'].freeze, - dv: ['Divehi', 'Dhivehi'].freeze, - dz: ['Dzongkha', 'རྫོང་ཁ'].freeze, - ee: ['Ewe', 'Eʋegbe'].freeze, - el: ['Greek', 'Ελληνικά'].freeze, - en: ['English', 'English'].freeze, - eo: ['Esperanto', 'Esperanto'].freeze, - es: ['Spanish', 'Español'].freeze, - et: ['Estonian', 'eesti'].freeze, - eu: ['Basque', 'euskara'].freeze, - fa: ['Persian', 'فارسی'].freeze, - ff: ['Fula', 'Fulfulde'].freeze, - fi: ['Finnish', 'suomi'].freeze, - fj: ['Fijian', 'Vakaviti'].freeze, - fo: ['Faroese', 'føroyskt'].freeze, - fr: ['French', 'Français'].freeze, - fy: ['Western Frisian', 'Frysk'].freeze, - ga: ['Irish', 'Gaeilge'].freeze, - gd: ['Scottish Gaelic', 'Gàidhlig'].freeze, - gl: ['Galician', 'galego'].freeze, - gu: ['Gujarati', 'ગુજરાતી'].freeze, - gv: ['Manx', 'Gaelg'].freeze, - ha: ['Hausa', 'هَوُسَ'].freeze, - he: ['Hebrew', 'עברית'].freeze, - hi: ['Hindi', 'हिन्दी'].freeze, - ho: ['Hiri Motu', 'Hiri Motu'].freeze, - hr: ['Croatian', 'Hrvatski'].freeze, - ht: ['Haitian', 'Kreyòl ayisyen'].freeze, - hu: ['Hungarian', 'magyar'].freeze, - hy: ['Armenian', 'Հայերեն'].freeze, - hz: ['Herero', 'Otjiherero'].freeze, - ia: ['Interlingua', 'Interlingua'].freeze, - id: ['Indonesian', 'Bahasa Indonesia'].freeze, - ie: ['Interlingue', 'Interlingue'].freeze, - ig: ['Igbo', 'Asụsụ Igbo'].freeze, - ii: ['Nuosu', 'ꆈꌠ꒿ Nuosuhxop'].freeze, - ik: ['Inupiaq', 'Iñupiaq'].freeze, - io: ['Ido', 'Ido'].freeze, - is: ['Icelandic', 'Íslenska'].freeze, - it: ['Italian', 'Italiano'].freeze, - iu: ['Inuktitut', 'ᐃᓄᒃᑎᑐᑦ'].freeze, - ja: ['Japanese', '日本語'].freeze, - jv: ['Javanese', 'basa Jawa'].freeze, - ka: ['Georgian', 'ქართული'].freeze, - kg: ['Kongo', 'Kikongo'].freeze, - ki: ['Kikuyu', 'Gĩkũyũ'].freeze, - kj: ['Kwanyama', 'Kuanyama'].freeze, - kk: ['Kazakh', 'қазақ тілі'].freeze, - kl: ['Kalaallisut', 'kalaallisut'].freeze, - km: ['Khmer', 'ខេមរភាសា'].freeze, - kn: ['Kannada', 'ಕನ್ನಡ'].freeze, - ko: ['Korean', '한국어'].freeze, - kr: ['Kanuri', 'Kanuri'].freeze, - ks: ['Kashmiri', 'कश्मीरी'].freeze, - ku: ['Kurdish', 'Kurdî'].freeze, - kv: ['Komi', 'коми кыв'].freeze, - kw: ['Cornish', 'Kernewek'].freeze, - ky: ['Kyrgyz', 'Кыргызча'].freeze, - la: ['Latin', 'latine'].freeze, - lb: ['Luxembourgish', 'Lëtzebuergesch'].freeze, - lg: ['Ganda', 'Luganda'].freeze, - li: ['Limburgish', 'Limburgs'].freeze, - ln: ['Lingala', 'Lingála'].freeze, - lo: ['Lao', 'ພາສາ'].freeze, - lt: ['Lithuanian', 'lietuvių kalba'].freeze, - lu: ['Luba-Katanga', 'Tshiluba'].freeze, - lv: ['Latvian', 'latviešu valoda'].freeze, - mg: ['Malagasy', 'fiteny malagasy'].freeze, - mh: ['Marshallese', 'Kajin M̧ajeļ'].freeze, - mi: ['Māori', 'te reo Māori'].freeze, - mk: ['Macedonian', 'македонски јазик'].freeze, - ml: ['Malayalam', 'മലയാളം'].freeze, - mn: ['Mongolian', 'Монгол хэл'].freeze, - mr: ['Marathi', 'मराठी'].freeze, - ms: ['Malay', 'Bahasa Malaysia'].freeze, - mt: ['Maltese', 'Malti'].freeze, - my: ['Burmese', 'ဗမာစာ'].freeze, - na: ['Nauru', 'Ekakairũ Naoero'].freeze, - nb: ['Norwegian Bokmål', 'Norsk bokmål'].freeze, - nd: ['Northern Ndebele', 'isiNdebele'].freeze, - ne: ['Nepali', 'नेपाली'].freeze, - ng: ['Ndonga', 'Owambo'].freeze, - nl: ['Dutch', 'Nederlands'].freeze, - nn: ['Norwegian Nynorsk', 'Norsk nynorsk'].freeze, - no: ['Norwegian', 'Norsk'].freeze, - nr: ['Southern Ndebele', 'isiNdebele'].freeze, - nv: ['Navajo', 'Diné bizaad'].freeze, - ny: ['Chichewa', 'chiCheŵa'].freeze, - oc: ['Occitan', 'occitan'].freeze, - oj: ['Ojibwe', 'ᐊᓂᔑᓈᐯᒧᐎᓐ'].freeze, - om: ['Oromo', 'Afaan Oromoo'].freeze, - or: ['Oriya', 'ଓଡ଼ିଆ'].freeze, - os: ['Ossetian', 'ирон æвзаг'].freeze, - pa: ['Panjabi', 'ਪੰਜਾਬੀ'].freeze, - pi: ['Pāli', 'पाऴि'].freeze, - pl: ['Polish', 'Polski'].freeze, - ps: ['Pashto', 'پښتو'].freeze, - pt: ['Portuguese', 'Português'].freeze, - qu: ['Quechua', 'Runa Simi'].freeze, - rm: ['Romansh', 'rumantsch grischun'].freeze, - rn: ['Kirundi', 'Ikirundi'].freeze, - ro: ['Romanian', 'Română'].freeze, - ru: ['Russian', 'Русский'].freeze, - rw: ['Kinyarwanda', 'Ikinyarwanda'].freeze, - sa: ['Sanskrit', 'संस्कृतम्'].freeze, - sc: ['Sardinian', 'sardu'].freeze, - sd: ['Sindhi', 'सिन्धी'].freeze, - se: ['Northern Sami', 'Davvisámegiella'].freeze, - sg: ['Sango', 'yângâ tî sängö'].freeze, - si: ['Sinhala', 'සිංහල'].freeze, - sk: ['Slovak', 'slovenčina'].freeze, - sl: ['Slovenian', 'slovenščina'].freeze, - sn: ['Shona', 'chiShona'].freeze, - so: ['Somali', 'Soomaaliga'].freeze, - sq: ['Albanian', 'Shqip'].freeze, - sr: ['Serbian', 'српски језик'].freeze, - ss: ['Swati', 'SiSwati'].freeze, - st: ['Southern Sotho', 'Sesotho'].freeze, - su: ['Sundanese', 'Basa Sunda'].freeze, - sv: ['Swedish', 'Svenska'].freeze, - sw: ['Swahili', 'Kiswahili'].freeze, - ta: ['Tamil', 'தமிழ்'].freeze, - te: ['Telugu', 'తెలుగు'].freeze, - tg: ['Tajik', 'тоҷикӣ'].freeze, - th: ['Thai', 'ไทย'].freeze, - ti: ['Tigrinya', 'ትግርኛ'].freeze, - tk: ['Turkmen', 'Türkmen'].freeze, - tl: ['Tagalog', 'Wikang Tagalog'].freeze, - tn: ['Tswana', 'Setswana'].freeze, - to: ['Tonga', 'faka Tonga'].freeze, - tr: ['Turkish', 'Türkçe'].freeze, - ts: ['Tsonga', 'Xitsonga'].freeze, - tt: ['Tatar', 'татар теле'].freeze, - tw: ['Twi', 'Twi'].freeze, - ty: ['Tahitian', 'Reo Tahiti'].freeze, - ug: ['Uyghur', 'ئۇيغۇرچە‎'].freeze, - uk: ['Ukrainian', 'Українська'].freeze, - ur: ['Urdu', 'اردو'].freeze, - uz: ['Uzbek', 'Ўзбек'].freeze, - ve: ['Venda', 'Tshivenḓa'].freeze, - vi: ['Vietnamese', 'Tiếng Việt'].freeze, - vo: ['Volapük', 'Volapük'].freeze, - wa: ['Walloon', 'walon'].freeze, - wo: ['Wolof', 'Wollof'].freeze, - xh: ['Xhosa', 'isiXhosa'].freeze, - yi: ['Yiddish', 'ייִדיש'].freeze, - yo: ['Yoruba', 'Yorùbá'].freeze, - za: ['Zhuang', 'Saɯ cueŋƅ'].freeze, - zh: ['Chinese', '中文'].freeze, - zu: ['Zulu', 'isiZulu'].freeze, - }.freeze - - ISO_639_3 = { - ast: ['Asturian', 'Asturianu'].freeze, - kab: ['Kabyle', 'Taqbaylit'].freeze, - kmr: ['Northern Kurdish', 'Kurmancî'].freeze, - zgh: ['Standard Moroccan Tamazight', 'ⵜⴰⵎⴰⵣⵉⵖⵜ'].freeze, - }.freeze - - SUPPORTED_LOCALES = {}.merge(ISO_639_1).merge(ISO_639_3).freeze - - # For ISO-639-1 and ISO-639-3 language codes, we have their official - # names, but for some translations, we need the names of the - # regional variants specifically - REGIONAL_LOCALE_NAMES = { + HUMAN_LOCALES = { + af: 'Afrikaans', + ar: 'العربية', + ast: 'Asturianu', + bg: 'Български', + bn: 'বাংলা', + br: 'Breton', + ca: 'Català', + co: 'Corsu', + cs: 'Čeština', + cy: 'Cymraeg', + da: 'Dansk', + de: 'Deutsch', + el: 'Ελληνικά', + en: 'English', + eo: 'Esperanto', 'es-AR': 'Español (Argentina)', 'es-MX': 'Español (México)', + es: 'Español', + et: 'Eesti', + eu: 'Euskara', + fa: 'فارسی', + fi: 'Suomi', + fr: 'Français', + ga: 'Gaeilge', + gd: 'Gàidhlig', + gl: 'Galego', + he: 'עברית', + hi: 'हिन्दी', + hr: 'Hrvatski', + hu: 'Magyar', + hy: 'Հայերեն', + id: 'Bahasa Indonesia', + io: 'Ido', + is: 'Íslenska', + it: 'Italiano', + ja: '日本語', + ka: 'ქართული', + kab: 'Taqbaylit', + kk: 'Қазақша', + kmr: 'Kurmancî', + kn: 'ಕನ್ನಡ', + ko: '한국어', + ku: 'سۆرانی', + lt: 'Lietuvių', + lv: 'Latviešu', + mk: 'Македонски', + ml: 'മലയാളം', + mr: 'मराठी', + ms: 'Bahasa Melayu', + nl: 'Nederlands', + nn: 'Nynorsk', + no: 'Norsk', + oc: 'Occitan', + pl: 'Polski', 'pt-BR': 'Português (Brasil)', 'pt-PT': 'Português (Portugal)', + pt: 'Português', + ro: 'Română', + ru: 'Русский', + sa: 'संस्कृतम्', + sc: 'Sardu', + si: 'සිංහල', + sk: 'Slovenčina', + sl: 'Slovenščina', + sq: 'Shqip', 'sr-Latn': 'Srpski (latinica)', + sr: 'Српски', + sv: 'Svenska', + ta: 'தமிழ்', + te: 'తెలుగు', + th: 'ไทย', + tr: 'Türkçe', + uk: 'Українська', + ur: 'اُردُو', + vi: 'Tiếng Việt', + zgh: 'ⵜⴰⵎⴰⵣⵉⵖⵜ', 'zh-CN': '简体中文', 'zh-HK': '繁體中文(香港)', 'zh-TW': '繁體中文(臺灣)', + zh: '中文', }.freeze def human_locale(locale) if locale == 'und' I18n.t('generic.none') - elsif (supported_locale = SUPPORTED_LOCALES[locale.to_sym]) - supported_locale[1] - elsif (regional_locale = REGIONAL_LOCALE_NAMES[locale.to_sym]) - regional_locale else - locale + HUMAN_LOCALES[locale.to_sym] || locale end end - - def valid_locale_or_nil(str) - return if str.blank? - - code, = str.to_s.split(/[_-]/) # Strip out the region from e.g. en_US or ja-JP - - return unless valid_locale?(code) - - code - end - - def valid_locale?(locale) - SUPPORTED_LOCALES.key?(locale.to_sym) - end end diff --git a/app/helpers/settings_helper.rb b/app/helpers/settings_helper.rb index 3d5592867cfe26..23739d1cd4eb9d 100644 --- a/app/helpers/settings_helper.rb +++ b/app/helpers/settings_helper.rb @@ -2,7 +2,7 @@ module SettingsHelper def filterable_languages - LanguagesHelper::SUPPORTED_LOCALES.keys + LanguageDetector.instance.language_names.select(&LanguagesHelper::HUMAN_LOCALES.method(:key?)) end def hash_to_object(hash) diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb index cf31b6ff623a19..ad273c20bb0bb0 100644 --- a/app/lib/activitypub/activity/create.rb +++ b/app/lib/activitypub/activity/create.rb @@ -112,7 +112,7 @@ def process_status_params url: @status_parser.url || @status_parser.uri, account: @account, text: converted_object_type? ? converted_text : (@status_parser.text || ''), - language: @status_parser.language, + language: @status_parser.language || detected_language, spoiler_text: converted_object_type? ? '' : (@status_parser.spoiler_text || ''), created_at: @status_parser.created_at, edited_at: @status_parser.edited_at, @@ -370,6 +370,10 @@ def converted_text Formatter.instance.linkify([@status_parser.title.presence, @status_parser.spoiler_text.presence, @status_parser.url || @status_parser.uri].compact.join("\n\n")) end + def detected_language + LanguageDetector.instance.detect(@status_parser.text, @account) if supported_object_type? + end + def unsupported_media_type?(mime_type) mime_type.present? && !MediaAttachment.supported_mime_types.include?(mime_type) end diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb new file mode 100644 index 00000000000000..40452eddc96df1 --- /dev/null +++ b/app/lib/language_detector.rb @@ -0,0 +1,101 @@ +# frozen_string_literal: true + +class LanguageDetector + include Singleton + + WORDS_THRESHOLD = 4 + RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\p{Thai}]+/m + + def initialize + @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048) + end + + def detect(text, account) + input_text = prepare_text(text) + + return if input_text.blank? + + detect_language_code(input_text) || default_locale(account) + end + + def language_names + @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq + end + + private + + def prepare_text(text) + simplify_text(text).strip + end + + def unreliable_input?(text) + !reliable_input?(text) + end + + def reliable_input?(text) + sufficient_text_length?(text) || language_specific_character_set?(text) + end + + def sufficient_text_length?(text) + text.split(/\s+/).size >= WORDS_THRESHOLD + end + + def language_specific_character_set?(text) + words = text.scan(RELIABLE_CHARACTERS_RE) + + if words.present? + words.reduce(0) { |acc, elem| acc + elem.size }.to_f / text.size > 0.3 + else + false + end + end + + def detect_language_code(text) + return if unreliable_input?(text) + + result = @identifier.find_language(text) + + iso6391(result.language.to_s).to_sym if result&.reliable? + end + + def iso6391(bcp47) + iso639 = bcp47.split('-').first + + # CLD3 returns grandfathered language code for Hebrew + return 'he' if iso639 == 'iw' + + ISO_639.find(iso639).alpha2 + end + + def simplify_text(text) + new_text = remove_html(text) + new_text.gsub!(FetchLinkCardService::URL_PATTERN, '\1') + new_text.gsub!(Account::MENTION_RE, '') + new_text.gsub!(Tag::HASHTAG_RE) { |string| string.gsub(/[#_]/, '#' => '', '_' => ' ').gsub(/[a-z][A-Z]|[a-zA-Z][\d]/) { |s| s.insert(1, ' ') }.downcase } + new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '') + new_text.gsub!(/\s+/, ' ') + new_text + end + + def new_scrubber + scrubber = Rails::Html::PermitScrubber.new + scrubber.tags = %w(br p) + scrubber + end + + def scrubber + @scrubber ||= new_scrubber + end + + def remove_html(text) + text = Loofah.fragment(text).scrub!(scrubber).to_s + text.gsub!('
', "\n") + text.gsub!('

', "\n\n") + text.gsub!(/(^

|<\/p>$)/, '') + text + end + + def default_locale(account) + account.user_locale&.to_sym || I18n.default_locale if account.local? + end +end diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index fabbd244df9b15..d2bcf0c2505596 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -2,7 +2,6 @@ class LinkDetailsExtractor include ActionView::Helpers::TagHelper - include LanguagesHelper # Some publications wrap their JSON-LD data in their