From f562b4e9279c77ed0f91d448bd97f1166262ee85 Mon Sep 17 00:00:00 2001
From: Christian Heusel <christian@heusel.eu>
Date: Fri, 2 Aug 2024 15:12:44 +0200
Subject: [PATCH] aurweb: Create robots.txt

This should e.g. forbid crawlers from indexing all of the git diffs,
which puts unnecessary load on the server and is not really of any
benefit to have indexed anyway.

Link: https://gitlab.archlinux.org/archlinux/infrastructure/-/issues/610
Reviewed-by: Sven-Hendrik Haase <svenstaro@gmail.com>
Reviewed-by: Levente Polyak <anthraxx@archlinux.org>
Signed-off-by: Christian Heusel <christian@heusel.eu>
---
 roles/aurweb/files/robots.txt          | 12 ++++++++++++
 roles/aurweb/tasks/main.yml            |  3 +++
 roles/aurweb/templates/nginx.d.conf.j2 |  4 ++++
 3 files changed, 19 insertions(+)
 create mode 100644 roles/aurweb/files/robots.txt

diff --git a/roles/aurweb/files/robots.txt b/roles/aurweb/files/robots.txt
new file mode 100644
index 000000000..d87a40c8c
--- /dev/null
+++ b/roles/aurweb/files/robots.txt
@@ -0,0 +1,12 @@
+User-agent: *
+# block the search page from being indexed, as the search is done via URL parameters
+Disallow: /packages?*
+# block all interactive things from being indexed, such as posting requests
+Disallow: /pkgbase/*
+# block all account pages from being indexed, as they require a login anyway
+Disallow: /account/*
+# block cgit except tree and log; Allow goes first so first-match parsers honor it
+Allow: /cgit/aur.git/tree
+Allow: /cgit/aur.git/log
+Disallow: /cgit/aur.git/*
+Crawl-delay: 2
diff --git a/roles/aurweb/tasks/main.yml b/roles/aurweb/tasks/main.yml
index e53c9e90d..2799ba975 100644
--- a/roles/aurweb/tasks/main.yml
+++ b/roles/aurweb/tasks/main.yml
@@ -109,6 +109,9 @@
 - name: Copy aurweb configuration file
   copy: src={{ aurweb_dir }}/conf/config.defaults dest={{ aurweb_conf_dir }}/config.defaults remote_src=yes owner=root group=root mode=0644
 
+- name: Configure robots.txt
+  copy: src=robots.txt dest="{{ aurweb_dir }}/robots.txt" owner=root group=root mode=0644
+
 - name: Install goaurrpc configuration
   template: src=goaurrpc.conf.j2 dest=/etc/goaurrpc.conf owner=root group=root mode=0644
 
diff --git a/roles/aurweb/templates/nginx.d.conf.j2 b/roles/aurweb/templates/nginx.d.conf.j2
index 547fe49a3..1373b6623 100644
--- a/roles/aurweb/templates/nginx.d.conf.j2
+++ b/roles/aurweb/templates/nginx.d.conf.j2
@@ -51,6 +51,10 @@ server {
     root    {{ aurweb_dir }}/static;
     index   index.php;
 
+    location = /robots.txt {
+        alias {{ aurweb_dir }}/robots.txt;
+    }
+
     # redirect /tu to /package-maintainer for external links
     location ~ ^/tu($|/.*) {
         return 301 https://aur.archlinux.org/package-maintainer$1;
-- 
GitLab