Skip to content

Commit

Permalink
✨ more Meta crawlers (#153)
Browse files Browse the repository at this point in the history
The `Facebook` matcher has been split into two matchers:

* `Facebook` to detect Meta crawlers with the prefix `facebook`;
* and `Meta` for crawlers with the prefix `meta`.

They both share the same code to fetch IP ranges.
  • Loading branch information
alaz authored Aug 31, 2024
1 parent a4d24a1 commit 3e1a4f5
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 23 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ end
- [Baidu spider](http://help.baidu.com/question?prod_en=master&class=498&id=1000973)
- [Bingbot](https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/)
- [DuckDuckGo bot](https://duckduckgo.com/duckduckbot)
- [Facebook crawler](https://developers.facebook.com/docs/sharing/webmasters/crawler)
- [Google crawlers](https://support.google.com/webmasters/answer/1061943)
- [IAS](https://integralads.com/ias-privacy-data-management/policies/site-indexing-policy/)
- [OpenAI GPTBot](https://platform.openai.com/docs/gptbot)
- [Oracle Data Cloud Crawler](https://www.oracle.com/corporate/acquisitions/grapeshot/crawler.html)
- [Marginalia](https://www.marginalia.nu/marginalia-search/for-webmasters/)
- [Meta / Facebook Web crawlers](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/)
- [Petal search engine](http://aspiegel.com/petalbot)
- [Pinterest](https://help.pinterest.com/en/articles/about-pinterest-crawler-0)
- [Twitterbot](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/getting-started),
Expand Down
1 change: 1 addition & 0 deletions lib/legitbot.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
require_relative 'legitbot/ias'
require_relative 'legitbot/oracle'
require_relative 'legitbot/marginalia'
require_relative 'legitbot/meta'
require_relative 'legitbot/petalbot'
require_relative 'legitbot/pinterest'
require_relative 'legitbot/twitter'
Expand Down
12 changes: 3 additions & 9 deletions lib/legitbot/facebook.rb
Original file line number Diff line number Diff line change
@@ -1,20 +1,14 @@
# frozen_string_literal: true

require 'irrc'
require_relative 'meta'

module Legitbot # :nodoc:
# https://developers.facebook.com/docs/sharing/webmasters/crawler
#
# BotMatch subclass for Facebook-branded crawlers. IP ranges come from the
# shared MetaIpRanges#fetch_ip_ranges helper; the lookup is no longer
# duplicated inline, so the block performs a single fetch instead of running
# an extra IRRC query whose result was thrown away.
class Facebook < BotMatch
  # Retained so existing references to Legitbot::Facebook::AS keep resolving;
  # the helper reads its own copy of the AS number.
  AS = 'AS32934'
  extend MetaIpRanges

  ip_ranges do
    fetch_ip_ranges
  end
end

Expand Down
33 changes: 33 additions & 0 deletions lib/legitbot/meta.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# frozen_string_literal: true

require 'irrc'

module Legitbot # :nodoc:
module MetaIpRanges # :nodoc:
  AS = 'AS32934'

  # Asks an Irrc client to look up the AS object in :radb (source :radb) and
  # returns the entries found under the :ipv4 and :ipv6 families as one flat
  # array, IPv4 entries first.
  def fetch_ip_ranges
    irr = Irrc::Client.new
    irr.query(:radb, AS, source: :radb)
    lookup = irr.perform

    per_family = %i[ipv4 ipv6].map { |family| lookup[AS][family][AS] }
    per_family.flatten
  end
end

# https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/
#
# BotMatch subclass for Meta-branded crawlers; its IP ranges are obtained
# through the MetaIpRanges#fetch_ip_ranges helper it extends itself with.
class Meta < BotMatch
  extend MetaIpRanges

  ip_ranges { fetch_ip_ranges }
end

# Associate the Meta matcher with the crawler names it should be selected for.
rule(Legitbot::Meta, %w[meta-externalagent meta-externalfetcher])
end
13 changes: 0 additions & 13 deletions test/facebook_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,6 @@

require_relative 'test_helper'

# Test-only reopening of Legitbot::Facebook: defines a `whois` class method
# that returns a fixed hash with a hard-coded :ipv4 prefix list and an empty
# :ipv6 list.
# NOTE(review): presumably this stands in for a live lookup during tests;
# confirm against how `whois` is consumed by the matcher.
module Legitbot
class Facebook
# rubocop:disable Layout/LineLength
def self.whois
{
ipv4: ['69.63.176.0/20', '66.220.144.0/20', '66.220.144.0/21', '69.63.184.0/21', '69.63.176.0/21', '74.119.76.0/22', '69.171.255.0/24', '173.252.64.0/18', '69.171.224.0/19', '69.171.224.0/20', '103.4.96.0/22', '69.63.176.0/24', '173.252.64.0/19', '173.252.70.0/24', '31.13.64.0/18', '31.13.24.0/21', '66.220.152.0/21', '66.220.159.0/24', '69.171.239.0/24', '69.171.240.0/20', '31.13.64.0/19', '31.13.64.0/24', '31.13.65.0/24', '31.13.67.0/24', '31.13.68.0/24', '31.13.69.0/24', '31.13.70.0/24', '31.13.71.0/24', '31.13.72.0/24', '31.13.73.0/24', '31.13.74.0/24', '31.13.75.0/24', '31.13.76.0/24', '31.13.77.0/24', '31.13.96.0/19', '31.13.66.0/24', '173.252.96.0/19', '69.63.178.0/24', '31.13.78.0/24', '31.13.79.0/24', '31.13.80.0/24', '31.13.82.0/24', '31.13.83.0/24', '31.13.84.0/24', '31.13.85.0/24', '31.13.86.0/24', '31.13.87.0/24', '31.13.88.0/24', '31.13.89.0/24', '31.13.90.0/24', '31.13.91.0/24', '31.13.92.0/24', '31.13.93.0/24', '31.13.94.0/24', '31.13.95.0/24', '69.171.253.0/24', '69.63.186.0/24', '31.13.81.0/24', '179.60.192.0/22', '179.60.192.0/24', '179.60.193.0/24', '179.60.194.0/24', '179.60.195.0/24', '185.60.216.0/22', '45.64.40.0/22', '185.60.216.0/24', '185.60.217.0/24', '185.60.218.0/24', '185.60.219.0/24', '129.134.0.0/16', '157.240.0.0/16', '157.240.8.0/24', '157.240.0.0/24', '157.240.1.0/24', '157.240.2.0/24', '157.240.3.0/24', '157.240.4.0/24', '157.240.5.0/24', '157.240.6.0/24', '157.240.7.0/24', '157.240.9.0/24', '157.240.10.0/24', '157.240.16.0/24', '157.240.19.0/24', '157.240.11.0/24', '157.240.12.0/24', '157.240.13.0/24', '157.240.14.0/24', '157.240.15.0/24', '157.240.17.0/24', '157.240.18.0/24', '157.240.20.0/24', '157.240.21.0/24', '157.240.22.0/24', '157.240.23.0/24', '157.240.0.0/17', '69.171.250.0/24', '157.240.24.0/24', '157.240.25.0/24', '199.201.64.0/24', '199.201.65.0/24', '199.201.64.0/22', '204.15.20.0/22', '157.240.192.0/24', '129.134.0.0/17', '157.240.198.0/24'],
ipv6: []
}
end
# rubocop:enable Layout/LineLength
end
end

class FacebookTest < Minitest::Test
def test_valid_ip
ip = '69.63.186.89'
Expand Down
13 changes: 13 additions & 0 deletions test/lib/meta_ip_ranges_mock.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# frozen_string_literal: true

# Test mock: reopens Legitbot::MetaIpRanges and replaces fetch_ip_ranges with
# a version that returns a fixed list of prefixes instead of running the
# lookup. Must be loaded after the real module, because the alias below needs
# the original method to already exist.
module Legitbot
module MetaIpRanges
# Keep the real implementation reachable under a different name so it can
# still be called explicitly.
alias fetch_ip_ranges_orig fetch_ip_ranges

# rubocop:disable Layout/LineLength
# Returns a hard-coded array of IPv4 CIDR strings.
def fetch_ip_ranges
['69.63.176.0/20', '66.220.144.0/20', '66.220.144.0/21', '69.63.184.0/21', '69.63.176.0/21', '74.119.76.0/22', '69.171.255.0/24', '173.252.64.0/18', '69.171.224.0/19', '69.171.224.0/20', '103.4.96.0/22', '69.63.176.0/24', '173.252.64.0/19', '173.252.70.0/24', '31.13.64.0/18', '31.13.24.0/21', '66.220.152.0/21', '66.220.159.0/24', '69.171.239.0/24', '69.171.240.0/20', '31.13.64.0/19', '31.13.64.0/24', '31.13.65.0/24', '31.13.67.0/24', '31.13.68.0/24', '31.13.69.0/24', '31.13.70.0/24', '31.13.71.0/24', '31.13.72.0/24', '31.13.73.0/24', '31.13.74.0/24', '31.13.75.0/24', '31.13.76.0/24', '31.13.77.0/24', '31.13.96.0/19', '31.13.66.0/24', '173.252.96.0/19', '69.63.178.0/24', '31.13.78.0/24', '31.13.79.0/24', '31.13.80.0/24', '31.13.82.0/24', '31.13.83.0/24', '31.13.84.0/24', '31.13.85.0/24', '31.13.86.0/24', '31.13.87.0/24', '31.13.88.0/24', '31.13.89.0/24', '31.13.90.0/24', '31.13.91.0/24', '31.13.92.0/24', '31.13.93.0/24', '31.13.94.0/24', '31.13.95.0/24', '69.171.253.0/24', '69.63.186.0/24', '31.13.81.0/24', '179.60.192.0/22', '179.60.192.0/24', '179.60.193.0/24', '179.60.194.0/24', '179.60.195.0/24', '185.60.216.0/22', '45.64.40.0/22', '185.60.216.0/24', '185.60.217.0/24', '185.60.218.0/24', '185.60.219.0/24', '129.134.0.0/16', '157.240.0.0/16', '157.240.8.0/24', '157.240.0.0/24', '157.240.1.0/24', '157.240.2.0/24', '157.240.3.0/24', '157.240.4.0/24', '157.240.5.0/24', '157.240.6.0/24', '157.240.7.0/24', '157.240.9.0/24', '157.240.10.0/24', '157.240.16.0/24', '157.240.19.0/24', '157.240.11.0/24', '157.240.12.0/24', '157.240.13.0/24', '157.240.14.0/24', '157.240.15.0/24', '157.240.17.0/24', '157.240.18.0/24', '157.240.20.0/24', '157.240.21.0/24', '157.240.22.0/24', '157.240.23.0/24', '157.240.0.0/17', '69.171.250.0/24', '157.240.24.0/24', '157.240.25.0/24', '199.201.64.0/24', '199.201.65.0/24', '199.201.64.0/22', '204.15.20.0/22', '157.240.192.0/24', '129.134.0.0/17', '157.240.198.0/24']
end
# rubocop:enable Layout/LineLength
end
end
79 changes: 79 additions & 0 deletions test/meta_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# frozen_string_literal: true

require_relative 'test_helper'

# Bare host class that mixes in Legitbot::MetaIpRanges as instance methods,
# so a test can call the module's methods on a plain object.
class MetaIpRanges
include Legitbot::MetaIpRanges
end

# Exercises the Meta matcher: the real range fetch, direct IP validation, and
# user-agent based detection through Legitbot.bot.
class MetaTest < Minitest::Test
  def test_fetch_ips
    # NOTE: network call
    fetched = MetaIpRanges.new.fetch_ip_ranges_orig

    refute_nil fetched
    assert_kind_of Array, fetched
    refute_empty fetched
  end

  def test_valid_ip
    %w[69.63.186.89 69.171.251.1].each do |ip|
      assert_predicate Legitbot::Meta.new(ip), :valid?
    end
  end

  def test_invalid_ip
    assert_predicate Legitbot::Meta.new('127.0.0.1'), :fake?
  end

  def test_user_agent1
    assert_valid_meta_bot(
      'meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)',
      '31.13.76.56'
    )
  end

  def test_user_agent2
    assert_valid_meta_bot(
      'meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)',
      '173.252.87.8'
    )
  end

  def test_user_agent3
    assert_valid_meta_bot(
      'meta-externalfetcher/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)',
      '173.252.87.8'
    )
  end

  # rubocop:disable Layout/LineLength
  def test_user_agent4
    spoofed_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.4 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.4 meta-externalagent/1.1 Twitterbot/1.0'

    Legitbot.bot(spoofed_agent, '92.243.181.7') do |bot|
      assert_includes %i[meta twitter], bot.detected_as
      assert_predicate bot, :fake?
    end
  end
  # rubocop:enable Layout/LineLength

  private

  # Shared body of the user-agent tests: the request must be detected as
  # :meta and must validate against the matcher's IP ranges.
  def assert_valid_meta_bot(user_agent, ip)
    Legitbot.bot(user_agent, ip) do |bot|
      assert_equal :meta, bot.detected_as
      assert_predicate bot, :valid?
    end
  end
end
1 change: 1 addition & 0 deletions test/test_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@
require 'minitest/autorun'
require 'minitest/hooks/test'
require 'lib/dns_server_mock'
require 'lib/meta_ip_ranges_mock'

0 comments on commit 3e1a4f5

Please sign in to comment.