From f562b4e9279c77ed0f91d448bd97f1166262ee85 Mon Sep 17 00:00:00 2001 From: Christian Heusel <christian@heusel.eu> Date: Fri, 2 Aug 2024 15:12:44 +0200 Subject: [PATCH] aurweb: Create robots.txt This should, e.g., forbid crawlers from indexing all of the git diffs, which puts unnecessary load on the server and is of no real benefit to have indexed anyway. Link: https://gitlab.archlinux.org/archlinux/infrastructure/-/issues/610 Reviewed-by: Sven-Hendrik Haase <svenstaro@gmail.com> Reviewed-by: Levente Polyak <anthraxx@archlinux.org> Signed-off-by: Christian Heusel <christian@heusel.eu> --- roles/aurweb/files/robots.txt | 12 ++++++++++++ roles/aurweb/tasks/main.yml | 3 +++ roles/aurweb/templates/nginx.d.conf.j2 | 4 ++++ 3 files changed, 19 insertions(+) create mode 100644 roles/aurweb/files/robots.txt diff --git a/roles/aurweb/files/robots.txt b/roles/aurweb/files/robots.txt new file mode 100644 index 000000000..d87a40c8c --- /dev/null +++ b/roles/aurweb/files/robots.txt @@ -0,0 +1,12 @@ +User-agent: * +# block the search page from indexing, as the search is done via URL parameters +Disallow: /packages?* +# block all interactive things from being indexed, such as posting requests +Disallow: /pkgbase/* +# block all account pages from being indexed, as they require login anyways +Disallow: /account/* +# block the cgit interface except for the useful things +Disallow: /cgit/aur.git/* +Allow: /cgit/aur.git/tree +Allow: /cgit/aur.git/log +Crawl-delay: 2 diff --git a/roles/aurweb/tasks/main.yml b/roles/aurweb/tasks/main.yml index e53c9e90d..2799ba975 100644 --- a/roles/aurweb/tasks/main.yml +++ b/roles/aurweb/tasks/main.yml @@ -109,6 +109,9 @@ - name: Copy aurweb configuration file copy: src={{ aurweb_dir }}/conf/config.defaults dest={{ aurweb_conf_dir }}/config.defaults remote_src=yes owner=root group=root mode=0644 +- name: Configure robots.txt + copy: src=robots.txt dest="{{ aurweb_dir }}/robots.txt" owner=root group=root mode=0644 + - name: Install goaurrpc configuration 
template: src=goaurrpc.conf.j2 dest=/etc/goaurrpc.conf owner=root group=root mode=0644 diff --git a/roles/aurweb/templates/nginx.d.conf.j2 b/roles/aurweb/templates/nginx.d.conf.j2 index 547fe49a3..1373b6623 100644 --- a/roles/aurweb/templates/nginx.d.conf.j2 +++ b/roles/aurweb/templates/nginx.d.conf.j2 @@ -51,6 +51,10 @@ server { root {{ aurweb_dir }}/static; index index.php; + location = /robots.txt { + alias {{ aurweb_dir }}/robots.txt; + } + # redirect /tu to /package-maintainer for external links location ~ ^/tu($|/.*) { return 301 https://aur.archlinux.org/package-maintainer$1; -- GitLab