From 7de2f7fde22fc80c1647e36a68838038761b0d6f Mon Sep 17 00:00:00 2001 From: Alexander Azarov Date: Sat, 31 Aug 2024 10:46:57 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20more=20Meta=20crawlers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- lib/legitbot.rb | 1 + lib/legitbot/facebook.rb | 12 ++--- lib/legitbot/meta.rb | 33 ++++++++++++++ test/facebook_test.rb | 13 ------ test/lib/meta_ip_ranges_mock.rb | 13 ++++++ test/meta_test.rb | 79 +++++++++++++++++++++++++++++++++ test/test_helper.rb | 1 + 8 files changed, 131 insertions(+), 23 deletions(-) create mode 100644 lib/legitbot/meta.rb create mode 100644 test/lib/meta_ip_ranges_mock.rb create mode 100644 test/meta_test.rb diff --git a/README.md b/README.md index a9c978c..40ecfca 100644 --- a/README.md +++ b/README.md @@ -55,12 +55,12 @@ end - [Baidu spider](http://help.baidu.com/question?prod_en=master&class=498&id=1000973) - [Bingbot](https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/) - [DuckDuckGo bot](https://duckduckgo.com/duckduckbot) -- [Facebook crawler](https://developers.facebook.com/docs/sharing/webmasters/crawler) - [Google crawlers](https://support.google.com/webmasters/answer/1061943) - [IAS](https://integralads.com/ias-privacy-data-management/policies/site-indexing-policy/) - [OpenAI GPTBot](https://platform.openai.com/docs/gptbot) - [Oracle Data Cloud Crawler](https://www.oracle.com/corporate/acquisitions/grapeshot/crawler.html) - [Marginalia](https://www.marginalia.nu/marginalia-search/for-webmasters/) +- [Meta / Facebook Web crawlers](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/) - [Petal search engine](http://aspiegel.com/petalbot) - [Pinterest](https://help.pinterest.com/en/articles/about-pinterest-crawler-0) - [Twitterbot](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/getting-started), diff --git a/lib/legitbot.rb b/lib/legitbot.rb index c2cd1ff..143c012 100644 --- a/lib/legitbot.rb +++ b/lib/legitbot.rb @@ -15,6 +15,7 @@ require_relative 'legitbot/ias' require_relative 'legitbot/oracle' require_relative 'legitbot/marginalia' +require_relative 'legitbot/meta' require_relative 'legitbot/petalbot' require_relative 'legitbot/pinterest' require_relative 'legitbot/twitter' diff --git a/lib/legitbot/facebook.rb b/lib/legitbot/facebook.rb index dad11e9..2f3e8fb 100644 --- a/lib/legitbot/facebook.rb +++ b/lib/legitbot/facebook.rb @@ -1,20 +1,14 @@ # frozen_string_literal: true -require 'irrc' +require_relative 'meta' module Legitbot # :nodoc: # https://developers.facebook.com/docs/sharing/webmasters/crawler class Facebook < BotMatch - AS = 'AS32934' + extend MetaIpRanges ip_ranges do - client = Irrc::Client.new - client.query :radb, AS, source: :radb - results = client.perform - - %i[ipv4 ipv6].map do |family| - results[AS][family][AS] - end.flatten + fetch_ip_ranges end end diff --git a/lib/legitbot/meta.rb b/lib/legitbot/meta.rb new file mode 100644 index 0000000..5405221 --- /dev/null +++ b/lib/legitbot/meta.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require 'irrc' + +module Legitbot # :nodoc: + module MetaIpRanges # :nodoc: + AS = 'AS32934' + + def fetch_ip_ranges + client = Irrc::Client.new + client.query :radb, AS, source: :radb + results = client.perform + + %i[ipv4 ipv6].map do |family| + results[AS][family][AS] + end.flatten + end + end + + # https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/ + class Meta < BotMatch + extend MetaIpRanges + + ip_ranges do + fetch_ip_ranges + end + end + + rule Legitbot::Meta, %w[ + meta-externalagent + meta-externalfetcher + ] +end diff --git a/test/facebook_test.rb b/test/facebook_test.rb index d4b8b32..1b05577 100644 --- a/test/facebook_test.rb +++ b/test/facebook_test.rb @@ -2,19 +2,6 @@ require_relative 'test_helper' -module Legitbot - class Facebook - # rubocop:disable Layout/LineLength - def self.whois - { - ipv4: ['69.63.176.0/20', '66.220.144.0/20', '66.220.144.0/21', '69.63.184.0/21', '69.63.176.0/21', '74.119.76.0/22', '69.171.255.0/24', '173.252.64.0/18', '69.171.224.0/19', '69.171.224.0/20', '103.4.96.0/22', '69.63.176.0/24', '173.252.64.0/19', '173.252.70.0/24', '31.13.64.0/18', '31.13.24.0/21', '66.220.152.0/21', '66.220.159.0/24', '69.171.239.0/24', '69.171.240.0/20', '31.13.64.0/19', '31.13.64.0/24', '31.13.65.0/24', '31.13.67.0/24', '31.13.68.0/24', '31.13.69.0/24', '31.13.70.0/24', '31.13.71.0/24', '31.13.72.0/24', '31.13.73.0/24', '31.13.74.0/24', '31.13.75.0/24', '31.13.76.0/24', '31.13.77.0/24', '31.13.96.0/19', '31.13.66.0/24', '173.252.96.0/19', '69.63.178.0/24', '31.13.78.0/24', '31.13.79.0/24', '31.13.80.0/24', '31.13.82.0/24', '31.13.83.0/24', '31.13.84.0/24', '31.13.85.0/24', '31.13.86.0/24', '31.13.87.0/24', '31.13.88.0/24', '31.13.89.0/24', '31.13.90.0/24', '31.13.91.0/24', '31.13.92.0/24', '31.13.93.0/24', '31.13.94.0/24', '31.13.95.0/24', '69.171.253.0/24', '69.63.186.0/24', '31.13.81.0/24', '179.60.192.0/22', '179.60.192.0/24', '179.60.193.0/24', '179.60.194.0/24', '179.60.195.0/24', '185.60.216.0/22', '45.64.40.0/22', '185.60.216.0/24', '185.60.217.0/24', '185.60.218.0/24', '185.60.219.0/24', '129.134.0.0/16', '157.240.0.0/16', '157.240.8.0/24', '157.240.0.0/24', '157.240.1.0/24', '157.240.2.0/24', '157.240.3.0/24', '157.240.4.0/24', '157.240.5.0/24', '157.240.6.0/24', '157.240.7.0/24', '157.240.9.0/24', '157.240.10.0/24', '157.240.16.0/24', '157.240.19.0/24', '157.240.11.0/24', '157.240.12.0/24', '157.240.13.0/24', '157.240.14.0/24', '157.240.15.0/24', '157.240.17.0/24', '157.240.18.0/24', '157.240.20.0/24', '157.240.21.0/24', '157.240.22.0/24', '157.240.23.0/24', '157.240.0.0/17', '69.171.250.0/24', '157.240.24.0/24', '157.240.25.0/24', '199.201.64.0/24', '199.201.65.0/24', '199.201.64.0/22', '204.15.20.0/22', '157.240.192.0/24', '129.134.0.0/17', '157.240.198.0/24'], - ipv6: [] - } - end - # rubocop:enable Layout/LineLength - end -end - class FacebookTest < Minitest::Test def test_valid_ip ip = '69.63.186.89' diff --git a/test/lib/meta_ip_ranges_mock.rb b/test/lib/meta_ip_ranges_mock.rb new file mode 100644 index 0000000..3df26f3 --- /dev/null +++ b/test/lib/meta_ip_ranges_mock.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +module Legitbot + module MetaIpRanges + alias fetch_ip_ranges_orig fetch_ip_ranges + + # rubocop:disable Layout/LineLength + def fetch_ip_ranges + ['69.63.176.0/20', '66.220.144.0/20', '66.220.144.0/21', '69.63.184.0/21', '69.63.176.0/21', '74.119.76.0/22', '69.171.255.0/24', '173.252.64.0/18', '69.171.224.0/19', '69.171.224.0/20', '103.4.96.0/22', '69.63.176.0/24', '173.252.64.0/19', '173.252.70.0/24', '31.13.64.0/18', '31.13.24.0/21', '66.220.152.0/21', '66.220.159.0/24', '69.171.239.0/24', '69.171.240.0/20', '31.13.64.0/19', '31.13.64.0/24', '31.13.65.0/24', '31.13.67.0/24', '31.13.68.0/24', '31.13.69.0/24', '31.13.70.0/24', '31.13.71.0/24', '31.13.72.0/24', '31.13.73.0/24', '31.13.74.0/24', '31.13.75.0/24', '31.13.76.0/24', '31.13.77.0/24', '31.13.96.0/19', '31.13.66.0/24', '173.252.96.0/19', '69.63.178.0/24', '31.13.78.0/24', '31.13.79.0/24', '31.13.80.0/24', '31.13.82.0/24', '31.13.83.0/24', '31.13.84.0/24', '31.13.85.0/24', '31.13.86.0/24', '31.13.87.0/24', '31.13.88.0/24', '31.13.89.0/24', '31.13.90.0/24', '31.13.91.0/24', '31.13.92.0/24', '31.13.93.0/24', '31.13.94.0/24', '31.13.95.0/24', '69.171.253.0/24', '69.63.186.0/24', '31.13.81.0/24', '179.60.192.0/22', '179.60.192.0/24', '179.60.193.0/24', '179.60.194.0/24', '179.60.195.0/24', '185.60.216.0/22', '45.64.40.0/22', '185.60.216.0/24', '185.60.217.0/24', '185.60.218.0/24', '185.60.219.0/24', '129.134.0.0/16', '157.240.0.0/16', '157.240.8.0/24', '157.240.0.0/24', '157.240.1.0/24', '157.240.2.0/24', '157.240.3.0/24', '157.240.4.0/24', '157.240.5.0/24', '157.240.6.0/24', '157.240.7.0/24', '157.240.9.0/24', '157.240.10.0/24', '157.240.16.0/24', '157.240.19.0/24', '157.240.11.0/24', '157.240.12.0/24', '157.240.13.0/24', '157.240.14.0/24', '157.240.15.0/24', '157.240.17.0/24', '157.240.18.0/24', '157.240.20.0/24', '157.240.21.0/24', '157.240.22.0/24', '157.240.23.0/24', '157.240.0.0/17', '69.171.250.0/24', '157.240.24.0/24', '157.240.25.0/24', '199.201.64.0/24', '199.201.65.0/24', '199.201.64.0/22', '204.15.20.0/22', '157.240.192.0/24', '129.134.0.0/17', '157.240.198.0/24'] + end + # rubocop:enable Layout/LineLength + end +end diff --git a/test/meta_test.rb b/test/meta_test.rb new file mode 100644 index 0000000..8c59e47 --- /dev/null +++ b/test/meta_test.rb @@ -0,0 +1,79 @@ +# frozen_string_literal: true + +require_relative 'test_helper' + +class MetaIpRanges + include Legitbot::MetaIpRanges +end + +class MetaTest < Minitest::Test + def test_fetch_ips + # NOTE: network call + ip_ranges = MetaIpRanges.new.fetch_ip_ranges_orig + + refute_nil ip_ranges + assert_kind_of Array, ip_ranges + refute_empty ip_ranges + end + + def test_valid_ip + ip = '69.63.186.89' + match = Legitbot::Meta.new(ip) + + assert_predicate match, :valid? + + ip = '69.171.251.1' + match = Legitbot::Meta.new(ip) + + assert_predicate match, :valid? + end + + def test_invalid_ip + ip = '127.0.0.1' + match = Legitbot::Meta.new(ip) + + assert_predicate match, :fake? + end + + def test_user_agent1 + Legitbot.bot( + 'meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)', + '31.13.76.56' + ) do |bot| + assert_equal :meta, bot.detected_as + assert_predicate bot, :valid? + end + end + + def test_user_agent2 + Legitbot.bot( + 'meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)', + '173.252.87.8' + ) do |bot| + assert_equal :meta, bot.detected_as + assert_predicate bot, :valid? + end + end + + def test_user_agent3 + Legitbot.bot( + 'meta-externalfetcher/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)', + '173.252.87.8' + ) do |bot| + assert_equal :meta, bot.detected_as + assert_predicate bot, :valid? + end + end + + # rubocop:disable Layout/LineLength + def test_user_agent4 + Legitbot.bot( + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.4 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.4 meta-externalagent/1.1 Twitterbot/1.0', + '92.243.181.7' + ) do |bot| + assert_includes %i[meta twitter], bot.detected_as + assert_predicate bot, :fake? + end + end + # rubocop:enable Layout/LineLength +end diff --git a/test/test_helper.rb b/test/test_helper.rb index caf370a..668f635 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -13,3 +13,4 @@ require 'minitest/autorun' require 'minitest/hooks/test' require 'lib/dns_server_mock' +require 'lib/meta_ip_ranges_mock'