From 1e977978e172bdcac1d5d6437842a129a0233440 Mon Sep 17 00:00:00 2001
From: Kristian Klausen <kristian@klausen.dk>
Date: Fri, 2 Apr 2021 22:01:43 +0200
Subject: [PATCH] Implement downloading a range of tasks (the range is computed
 automatically)

---
 .gitlab-ci.yml | 10 +++++++---
 README.md      | 14 ++++++++++----
 snapshotter.sh | 25 ++++++++++++++++++++++---
 3 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b68f5c4d4e..81cf6f5d1c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -18,11 +18,15 @@ snapshot:
     - git fetch origin +refs/heads/snapshots:refs/remotes/origin/snapshots || true
   script:
     - git worktree add snapshots HEAD
-    - (cd snapshots && git checkout snapshots || git checkout --orphan snapshots && git rm -fr .)
-    - ./snapshotter.sh 10000 false false snapshots
+    - (cd snapshots && git checkout snapshots || (git checkout --orphan snapshots && git rm -fr .))
+    - mkdir snapshot
+    - ALBS_RANGE_DOWNLOAD_ENABLED=true ALBS_RANGE_DOWNLOAD_CHUNK="$(git rev-list --count origin/snapshots)" ./snapshotter.sh -1 false false snapshot
+    - rsync --recursive snapshot/ snapshots/
     - cd snapshots
+    - range="$(<range)"
+    - rm range
     - git add --all
-    - git commit --allow-empty --message="Snapshot $(date --iso-8601=minute)"
+    - 'git commit --allow-empty --message="Snapshot $(date --iso-8601=minute), tasks ${range}"'
     - git push --set-upstream origin snapshots
 
 pages:
diff --git a/README.md b/README.md
index 0f1281e2a9..df1976e777 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,13 @@ $ ./snapshotter.sh [maximum number of tasks to download] [download attachment: t
 ## How It Works
 
 1. [`https://bugs.archlinux.org/index.php?project=0&status[]=&changedfrom=2021-04-01`](https://bugs.archlinux.org/index.php?project=0&status[]=&changedfrom=2021-04-01) is scrapped to get the newest task id
-2. A list of URLs is generated: `https://bugs.archlinux.org/task/{1..$NEWEST_TASK_ID}`
-3. `wget` starts downloading the URLs, including page requisites and linked user pages
-4. `xsltproc` is run on all the HTML files to cleanup the html (remove navbar entries, login form etc.)
-5. `prettier` is run on all the HTML files to prettify the HTML (primarily fixing indentation)
+2. The range of tasks to download is decided:
+   * If `$ALBS_RANGE_DOWNLOAD_ENABLED = true` then:
+     * A range of tasks is computed based on `$ALBS_RANGE_DOWNLOAD_CHUNK` and `$ALBS_RANGE_DOWNLOAD_CHUNKS`
+   * else:
+     * `$min=1`
+     * `$max=$newest_task_id`
+3. A list of URLs is generated: `https://bugs.archlinux.org/task/{$min..$max}`
+4. `wget` starts downloading the URLs, including page requisites and linked user pages
+5. `xsltproc` is run on all the HTML files to clean up the HTML (remove navbar entries, login form etc.)
+6. `prettier` is run on all the HTML files to prettify the HTML (primarily fixing indentation)
diff --git a/snapshotter.sh b/snapshotter.sh
index 6331dfaa77..bfd9e7ee64 100755
--- a/snapshotter.sh
+++ b/snapshotter.sh
@@ -1,13 +1,17 @@
 #!/bin/bash
 set -o nounset -o errexit -o pipefail
 ALBS_PARALLELIZE_DOWNLOAD="${ALBS_PARALLELIZE_DOWNLOAD:-false}"
+ALBS_RANGE_DOWNLOAD_ENABLED="${ALBS_RANGE_DOWNLOAD_ENABLED:-false}" # "true" makes main() download only one chunk of the task id range
+ALBS_RANGE_DOWNLOAD_CHUNK="${ALBS_RANGE_DOWNLOAD_CHUNK:-$(date +%-d)}" # defaults to the day of the month, without zero padding
+ALBS_RANGE_DOWNLOAD_CHUNKS="${ALBS_RANGE_DOWNLOAD_CHUNKS:-10}" # number of chunks the task id range is divided into
+ALBS_RANGE_DOWNLOAD_CHUNK="$((ALBS_RANGE_DOWNLOAD_CHUNK % ALBS_RANGE_DOWNLOAD_CHUNKS + 1))" # wraps any non-negative value into 1..ALBS_RANGE_DOWNLOAD_CHUNKS
 
 function get_newest_task_id() {
   wget --output-document=- "https://bugs.archlinux.org/index.php?project=0&status[]=&changedfrom=2021-04-01" | grep --perl-regexp --only-matching --max-count 1 "(?<=task)[0-9]+"
 }
 
 function generate_urls() {
-  eval "echo https://bugs.archlinux.org/task/{1..$1} | tr ' ' '\n'"
+  eval "echo https://bugs.archlinux.org/task/{$1..$2} | tr ' ' '\n'" # $1 = first task id, $2 = last task id; prints one URL per line. eval is used because bash performs brace expansion before expanding $1/$2
 }
 
 function _download() {
@@ -71,13 +75,27 @@ function prettify() {
 
 function main() {
   local newest_task_id
-  if [ -n "${1:-}" ]; then
+  if [ -n "${1:-}" ] && [ "${1}" != -1 ]; then
     newest_task_id="${1}"
   else
     newest_task_id="$(get_newest_task_id)"
   fi
+  local min=1
+  local max=${newest_task_id}
+
+  if [[ ${ALBS_RANGE_DOWNLOAD_ENABLED} = true ]]; then
+    local max_tasks="$(( newest_task_id / ALBS_RANGE_DOWNLOAD_CHUNKS ))"
+    local overlap="$(( max_tasks * 5 / 100))" # 5% overlap
+
+    min="$(( (ALBS_RANGE_DOWNLOAD_CHUNK - 1) * max_tasks))"
+    max="$((ALBS_RANGE_DOWNLOAD_CHUNK * max_tasks + overlap))"
+    if (( newest_task_id < max )); then
+      max="${newest_task_id}"
+    fi
+  fi
+
   local urls
-  urls="$(generate_urls "${newest_task_id}")"
+  urls="$(generate_urls "${min}" "${max}")"
   local orig_pwd="${PWD}"
   local snapshot_dir=""
   if [ -z "${4:-}" ]; then
@@ -87,6 +105,7 @@ function main() {
     snapshot_dir="${4}"
   fi
   cd "${snapshot_dir}"
+  echo "${min}-${max}" > range
   download "${2:-true}" "${urls}"
   cleanup_html "${orig_pwd}"
   if [ "${3:-true}" = "true" ]; then
-- 
GitLab