summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Greg Roach <greg@subaqua.co.uk> 2025-07-23 15:10:47 +0100
committer Greg Roach <greg@subaqua.co.uk> 2025-07-23 15:10:47 +0100
commit 8ed218fd26ac2acc2ff1eebb3f14e46a8fba7b69 (patch)
tree e86bfad5fc4f5d46b44f7c52d966dc33b6fa7823
parent 5bcb6d8f8c7482574d4308c2ea87c44f35ebe415 (diff)
download webtrees-8ed218fd26ac2acc2ff1eebb3f14e46a8fba7b69.tar.gz
webtrees-8ed218fd26ac2acc2ff1eebb3f14e46a8fba7b69.tar.bz2
webtrees-8ed218fd26ac2acc2ff1eebb3f14e46a8fba7b69.zip
Improve robot handling
-rw-r--r-- app/Http/Middleware/BadBotBlocker.php 268
-rw-r--r-- app/Http/RequestHandlers/RobotsTxt.php 10
-rw-r--r-- app/Http/Routes/WebRoutes.php 22
3 files changed, 166 insertions, 134 deletions
diff --git a/app/Http/Middleware/BadBotBlocker.php b/app/Http/Middleware/BadBotBlocker.php
index 7b560495b5..ced1854491 100644
--- a/app/Http/Middleware/BadBotBlocker.php
+++ b/app/Http/Middleware/BadBotBlocker.php
@@ -47,110 +47,134 @@ use function str_ends_with;
*/
class BadBotBlocker implements MiddlewareInterface
{
+ public const ROBOT_ATTRIBUTE_NAME = 'is-a-robot';
+
// Cache whois requests. Try to avoid all caches expiring at the same time.
private const WHOIS_TTL_MIN = 28 * 86400;
private const WHOIS_TTL_MAX = 35 * 86400;
- /**
- * @see https://github.com/ai-robots-txt/ai.robots.txt for a list of AI crawlers.
- * We can't load this repository as a dependency as it's not a package.
- * Instead, the list from version 1.26 is copied here.
- */
- public const AI_ROBOTS = [
- 'AI2Bot',
- 'Ai2Bot-Dolma',
- 'Amazonbot',
- 'anthropic-ai',
- 'Applebot',
- 'Applebot-Extended',
- 'Brightbot 1.0',
- 'Bytespider',
- 'CCBot',
- 'ChatGPT-User',
- 'Claude-Web',
- 'ClaudeBot',
- 'cohere-ai',
- 'cohere-training-data-crawler',
- 'Crawlspace',
- 'Diffbot',
- 'DuckAssistBot',
- 'FacebookBot',
- 'FriendlyCrawler',
- 'Google-Extended',
- 'GoogleOther',
- 'GoogleOther-Image',
- 'GoogleOther-Video',
- 'GPTBot',
- 'iaskspider/2.0',
- 'ICC-Crawler',
- 'ImagesiftBot',
- 'img2dataset',
- 'ISSCyberRiskCrawler',
- 'Kangaroo Bot',
- 'meta-externalagent',
- 'meta-externalfetcher',
- 'OAI-SearchBot',
- 'omgili',
- 'omgilibot',
- 'PanguBot',
- 'PerplexityBot',
- 'PetalBot',
- 'Scrapy',
- 'SemrushBot-OCOB',
- 'SemrushBot-SWA',
- 'Sidetrade indexer bot',
- 'Timpibot',
- 'VelenPublicWebCrawler',
- 'Webzio-Extended',
- 'YouBot',
- ];
-
- // Other bad robots - SEO optimisers, advertisers, etc. This list is shared with robots.txt.
+ // An opinionated list of "bad" robots. Typically, these are AI and SEO crawlers.
public const BAD_ROBOTS = [
- 'admantx',
+ 'ADmantX',
+ 'AI2Bot',
'Adsbot',
+ 'AISearchBot',
'AhrefsBot',
+ 'Ai2Bot-Dolma',
'AliyunSecBot',
- 'AntBot', // Aggressive crawler
+ 'Amazonbot',
+ 'Andibot',
+ 'AntBot',
+ 'Applebot',
'AspiegelBot',
- 'Awario', // Brand management
- 'Barkrowler', // Crawler for babbar.tech
+ 'Awario',
'BLEXBot',
- 'CensysInspect', // Vulnerability scanner
- 'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
+ 'Barkrowler',
+ 'Brightbot',
+ 'Bytespider',
+ 'CCBot',
+ 'CensysInspect',
+ 'ChatGPT-User',
+ 'Claude-SearchBot',
+ 'Claude-User',
+ 'Claude-Web',
+ 'ClaudeBot',
+ 'Cotoyogi',
+ 'Crawlspace',
+ 'DataForSeoBot',
+ 'Datenbank Crawler',
+ 'Devin',
+ 'Diffbot',
'DotBot',
- 'Expanse', // Another pointless crawler
- 'fidget-spinner-bot', // Agressive crawler
- 'Foregenix', // Vulnerability scanner
- 'Go-http-client', // Crawler library used by many bots
+ 'DuckAssistBot',
+ 'Echobot Bot',
+ 'EchoboxBot',
+ 'Expanse',
+ 'FacebookBot',
+ 'Factset_spyderbot',
+ 'FirecrawlAgent',
+ 'Foregenix',
+ 'FriendlyCrawler',
+ 'GPTBot',
+ 'Gemini-Deep-Research',
+ 'Go-http-client',
+ 'Google-CloudVertexBot',
+ 'Google-Extended',
+ 'GoogleAgent-Mariner',
+ 'GoogleOther',
'Grapeshot',
- 'Honolulu-bot', // Aggressive crawer, no info available
- 'ia_archiver',
- 'internet-measurement', // Driftnet
+ 'Honolulu-bot',
+ 'ICC-Crawler',
+ 'ISSCyberRiskCrawler',
+ 'ImagesiftBot',
'IonCrawl',
- 'Java', // Crawler library used by many bots
- 'linabot', // Aggressive crawer, no info available
+ 'Java',
+ 'Kangaroo Bot',
'Linguee',
- 'MegaIndex.ru',
'MJ12bot',
- 'netEstate NE',
- 'panscient',
- 'phxbot', // Badly written crawler
- 'proximic',
- 'python-requests', // Crawler library used by many bots
- 'Scrapy', // Crawler library used by many bots
- 'SeekportBot', // Pretends to be a search engine - but isn't
- 'SemrushBot',
- 'serpstatbot',
+ 'MegaIndex.ru',
+ 'Meta-ExternalAgent',
+ 'Meta-ExternalFetcher',
+ 'MistralAI-User',
+ 'MyCentralAIScraperBot',
+ 'NovaAct',
+ 'OAI-SearchBot',
+ 'Operator',
+ 'PanguBot',
+ 'Panscient',
+ 'Perplexity-User',
+ 'PerplexityBot',
+ 'PetalBot',
+ 'PhindBot',
+ 'Poseidon Research Crawler',
+ 'QualifiedBot',
+ 'QuillBot',
+ 'SBIntuitionsBot',
'SEOkicks',
+ 'Scrapy',
+ 'SeekportBot',
+ 'SemrushBot',
+ 'Sidetrade indexer bot',
'SiteKiosk',
- 'test-bot', // Agressive crawler
+ 'SummalyBot',
+ 'Thinkbot',
+ 'TikTokSpider',
+ 'Timpibot',
'TinyTestBot',
'Turnitin',
- 'wp_is_mobile', // Nothing to do with wordpress
+ 'VelenPublicWebCrawler',
+ 'WARDBot',
+ 'Webzio-Extended',
'XoviBot',
+ 'YandexAdditional',
'YisouSpider',
+ 'YouBot',
'ZoominfoBot',
+ 'aiHitBot',
+ 'aiohttp',
+ 'anthropic-ai',
+ 'bedrockbot',
+ 'cohere-ai',
+ 'cohere-training-data-crawler',
+ 'facebookexternalhit',
+ 'fidget-spinner-bot',
+ 'iaskspider',
+ 'img2dataset',
+ 'internet-measurement',
+ 'linabot',
+ 'meta-externalagent',
+ 'meta-externalfetcher',
+ 'netEstate',
+ 'omgili',
+ 'panscient',
+ 'phxbot',
+ 'proximic',
+ 'python-requests',
+ 'quillbot.com',
+ 'wpbot',
+ 'serpstatbot',
+ 'test-bot',
+ 'wp_is_mobile',
];
/**
@@ -192,24 +216,9 @@ class BadBotBlocker implements MiddlewareInterface
/**
* Some search engines operate from designated IP addresses.
- *
- * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
+ * TODO: fetch current lists of IPs, rather than use hard-coded values.
+ * See https://merj.com/blog/dont-block-what-you-want-duckduckgo-and-common-crawl-to-provide-ip-address-api-endpoints
*/
- private const ROBOT_IPS = [
- 'DuckDuckBot' => [
- '23.21.227.69',
- '50.16.241.113',
- '50.16.241.114',
- '50.16.241.117',
- '50.16.247.234',
- '52.204.97.54',
- '52.5.190.19',
- '54.197.234.188',
- '54.208.100.253',
- '54.208.102.37',
- '107.21.1.8',
- ],
- ];
/**
* Some search engines operate from within a designated autonomous system.
@@ -236,11 +245,13 @@ class BadBotBlocker implements MiddlewareInterface
$address = Factory::parseAddressString($ip);
assert($address instanceof AddressInterface);
- foreach ([self::AI_ROBOTS, self::BAD_ROBOTS] as $robots) {
- foreach ($robots as $robot) {
- if (str_contains($ua, $robot)) {
- return $this->response();
- }
+ if ($ua === '') {
+ return $this->response();
+ }
+
+ foreach (self::BAD_ROBOTS as $robot) {
+ if (str_contains($ua, $robot)) {
+ return $this->response();
}
}
@@ -256,19 +267,7 @@ class BadBotBlocker implements MiddlewareInterface
}
}
- foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
- if (str_contains($ua, $robot)) {
- foreach ($valid_ip_ranges as $ip_range) {
- $range = Factory::parseRangeString($ip_range);
-
- if ($range instanceof RangeInterface && $range->contains($address)) {
- continue 2;
- }
- }
-
- return $this->response();
- }
- }
+ // TODO: fetch current lists of IPs, rather than use hard-coded values.
foreach (self::ROBOT_ASNS as $robot => $asns) {
foreach ($asns as $asn) {
@@ -296,6 +295,37 @@ class BadBotBlocker implements MiddlewareInterface
}
}
+ // No Cookies? Few headers? Probably a robot.
+ if ($request->getCookieParams() === [] && count($request->getHeaders()) <= 10) {
+ // Claims to be a browser?
+ if (preg_match('~^Mozilla/5.0 \(.*\) AppleWebKit/[0-9.]+ \(KHTML, like Gecko\) Chrome/[0-9.]+ Safari/[0-9.]+$~', $ua) === 1) {
+ // Prove it by setting a cookie
+ $content =
+ '<!DOCTYPE html>' .
+ '<html lang="en">' .
+ '<head>' .
+ '<meta charset="utf-8">' .
+ '<title>Cookie check</title>' .
+ '<meta http-equiv="refresh" content="0">' .
+ '</head>' .
+ '<body>Cookie check</body>' .
+ '</html>';
+
+ return response($content)->withHeader('set-cookie', 'x=y; HttpOnly; SameSite=Strict');
+ }
+
+ $request = $request->withAttribute(self::ROBOT_ATTRIBUTE_NAME, true);
+ }
+
+ // Scans for WordPress vulnerabilities?
+ // Block these before wasting resources on DB connections, sessions, etc.
+ $path = $request->getUri()->getPath();
+
+ if (str_starts_with($path, '/xmlrpc.php') || str_starts_with($path, '/wp-')) {
+ return $this->response();
+ }
+ // Do not set ROBOT_ATTRIBUTE_NAME here: only the cookie/header heuristic above may flag a request as a robot.
+
return $handler->handle($request);
}
@@ -335,8 +365,8 @@ class BadBotBlocker implements MiddlewareInterface
}, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
}
- private function response(): ResponseInterface
+ private function response(string $content = 'Not acceptable'): ResponseInterface
{
- return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
+ return response($content, StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
}
}
diff --git a/app/Http/RequestHandlers/RobotsTxt.php b/app/Http/RequestHandlers/RobotsTxt.php
index 47f22c6735..9fe68d6f6f 100644
--- a/app/Http/RequestHandlers/RobotsTxt.php
+++ b/app/Http/RequestHandlers/RobotsTxt.php
@@ -52,27 +52,19 @@ class RobotsTxt implements RequestHandlerInterface
private TreeService $tree_service;
- /**
- * @param ModuleService $module_service
- */
public function __construct(ModuleService $module_service, TreeService $tree_service)
{
$this->module_service = $module_service;
$this->tree_service = $tree_service;
}
- /**
- * @param ServerRequestInterface $request
- *
- * @return ResponseInterface
- */
public function handle(ServerRequestInterface $request): ResponseInterface
{
$base_url = Validator::attributes($request)->string('base_url');
$trees = $this->tree_service->all()->map(static fn (Tree $tree): string => $tree->name());
$data = [
- 'bad_user_agents' => [...BadBotBlocker::AI_ROBOTS, ...BadBotBlocker::BAD_ROBOTS],
+ 'bad_user_agents' => BadBotBlocker::BAD_ROBOTS,
'base_url' => $base_url,
'base_path' => parse_url($base_url, PHP_URL_PATH) ?? '',
'disallowed_paths' => self::DISALLOWED_PATHS,
diff --git a/app/Http/Routes/WebRoutes.php b/app/Http/Routes/WebRoutes.php
index c402fc96ff..4c2713aee2 100644
--- a/app/Http/Routes/WebRoutes.php
+++ b/app/Http/Routes/WebRoutes.php
@@ -27,6 +27,7 @@ use Fisharebest\Webtrees\Http\Middleware\AuthEditor;
use Fisharebest\Webtrees\Http\Middleware\AuthLoggedIn;
use Fisharebest\Webtrees\Http\Middleware\AuthManager;
use Fisharebest\Webtrees\Http\Middleware\AuthModerator;
+use Fisharebest\Webtrees\Http\Middleware\AuthNotRobot;
use Fisharebest\Webtrees\Http\RequestHandlers\AccountDelete;
use Fisharebest\Webtrees\Http\RequestHandlers\AccountEdit;
use Fisharebest\Webtrees\Http\RequestHandlers\AccountUpdate;
@@ -663,11 +664,7 @@ class WebRoutes
// Visitor routes with a tree.
$router->attach('', '/tree/{tree}', static function (Map $router) {
- $router->get(TreePage::class, '');
$router->get(AutoCompleteSurname::class, '/autocomplete/surname');
- $router->get(CalendarPage::class, '/calendar/{view}');
- $router->post(CalendarAction::class, '/calendar/{view}');
- $router->get(CalendarEvents::class, '/calendar-events/{view}');
$router->get(ContactPage::class, '/contact');
$router->post(ContactAction::class, '/contact');
$router->get(FamilyPage::class, '/family/{xref}{/slug}')->tokens(['slug' => '.*']);
@@ -707,9 +704,22 @@ class WebRoutes
$router->get(TomSelectSubmission::class, '/tom-select-submission');
$router->get(TomSelectSubmitter::class, '/tom-select-submitter');
$router->get(TomSelectRepository::class, '/tom-select-repository');
+ $router->get(TreePage::class, '');
$router->get(TreePageBlock::class, '/tree-page-block');
- $router->get('example', '/…')
- ->isRoutable(false);
+ $router->get('example', '/…')->isRoutable(false);
+ });
+
+ // Visitor routes with a tree (robots not allowed).
+ $router->attach('', '/tree/{tree}', static function (Map $router) {
+ $router->extras([
+ 'middleware' => [
+ AuthNotRobot::class,
+ ],
+ ]);
+
+ $router->get(CalendarPage::class, '/calendar/{view}');
+ $router->post(CalendarAction::class, '/calendar/{view}');
+ $router->get(CalendarEvents::class, '/calendar-events/{view}');
});
// Match module routes, with and without a tree.