From 9aefd872582254b7bb5e2cbb1e716c8948dcf0d2 Mon Sep 17 00:00:00 2001
From: Kristian Klausen <kristian@klausen.dk>
Date: Mon, 22 Jul 2024 21:05:03 +0200
Subject: [PATCH] archwiki: Add simple challenge for Chinese IP addresses

The wiki has been hammered with requests from some stupid Chinese
bots/crawlers. Adding a simple challenge (requiring a cookie to be set)
seems to be enough to throw them off.

This was initially added for all pages, but as that could affect Chinese
search engines (concern raised on the forum[1]), it was changed to only
affect "action views", which search engines are not supposed to crawl.

[1] https://bbs.archlinux.org/viewtopic.php?pid=2185963#p2185963
---
 host_vars/wiki.archlinux.org/misc        |  2 ++
 roles/archwiki/defaults/main.yml         |  1 +
 roles/archwiki/templates/nginx.d.conf.j2 | 31 ++++++++++++++++++++++++
 3 files changed, 34 insertions(+)

diff --git a/host_vars/wiki.archlinux.org/misc b/host_vars/wiki.archlinux.org/misc
index 6dfb7cfbf..728880621 100644
--- a/host_vars/wiki.archlinux.org/misc
+++ b/host_vars/wiki.archlinux.org/misc
@@ -2,3 +2,5 @@ filesystem: btrfs
 memcached_socket: "/run/memcached/archwiki.sock"
 wireguard_address: 10.0.0.22
 wireguard_public_key: bZeNWMLtyNDaFR7jjWr06nNZt/vV/OKNleV7XZZs+lc=
+nginx_extra_modules:
+  - name: geoip2
diff --git a/roles/archwiki/defaults/main.yml b/roles/archwiki/defaults/main.yml
index 6813ba68f..72bb56bb9 100644
--- a/roles/archwiki/defaults/main.yml
+++ b/roles/archwiki/defaults/main.yml
@@ -1,6 +1,7 @@
 archwiki_dir: '/srv/http/archwiki'
 archwiki_domain: 'wiki.archlinux.org'
 archwiki_nginx_conf: '/etc/nginx/nginx.d/archwiki.conf'
+archwiki_nginx_challenge_value: '41ce6c6'
 archwiki_user: 'archwiki'
 archwiki_repository: 'https://gitlab.archlinux.org/archlinux/archwiki.git'
 archwiki_version: '1.42.1-2'
diff --git a/roles/archwiki/templates/nginx.d.conf.j2 b/roles/archwiki/templates/nginx.d.conf.j2
index 6003fd321..86bc74263 100644
--- a/roles/archwiki/templates/nginx.d.conf.j2
+++ b/roles/archwiki/templates/nginx.d.conf.j2
@@ -13,6 +13,32 @@ upstream archwiki {
     server unix://{{ archwiki_socket }};
 }
 
+# $challenge_required2 is 0 only when the "challenge" cookie equals
+# "archwiki_nginx_challenge_value"; otherwise (unset or different) it is 1.
+map $cookie_challenge $challenge_required2 {
+    default 1;
+    {{ archwiki_nginx_challenge_value }} 0;
+}
+
+# Only "action views" (request URIs starting with "/index.php?") inherit
+# $challenge_required2; every other request gets $challenge_required = 0.
+map $request_uri $challenge_required {
+    default         0;
+    ~^/index\.php\? $challenge_required2;
+}
+
+geoip2 /var/lib/GeoIP/GeoLite2-Country.mmdb {
+    auto_reload 60m;
+    $geoip2_data_country_iso_code country iso_code;
+}
+
+# Final decision: $challenge_required for clients geolocated to "CN", else 0.
+# The cookie round trip is enough to "throw off" some bots/crawlers from China.
+map $geoip2_data_country_iso_code $challenge {
+    default 0;
+    CN      $challenge_required;
+}
+
 server {
     listen       80;
     listen       [::]:80;
@@ -103,6 +129,11 @@ server {
 
     # normal PHP FastCGI handler
     location ~ ^/[^/]+\.php$ {
+        if ($challenge) {
+            add_header Set-Cookie "challenge={{ archwiki_nginx_challenge_value }}; SameSite=Strict";
+            return 303 $scheme://$server_name$request_uri; # $request_uri already starts with "/"
+        }
+
         try_files $uri =404;
         access_log   /var/log/nginx/{{ archwiki_domain }}/access.log main;
         access_log   /var/log/nginx/{{ archwiki_domain }}/access.log.json json_main;
-- 
GitLab