From 1e977978e172bdcac1d5d6437842a129a0233440 Mon Sep 17 00:00:00 2001 From: Kristian Klausen <kristian@klausen.dk> Date: Fri, 2 Apr 2021 22:01:43 +0200 Subject: [PATCH] Implement downloading a range of tasks (the range is computed automatically) --- .gitlab-ci.yml | 10 +++++++--- README.md | 14 ++++++++++---- snapshotter.sh | 25 ++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b68f5c4d4e..81cf6f5d1c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,11 +18,15 @@ snapshot: - git fetch origin +refs/heads/snapshots:refs/remotes/origin/snapshots || true script: - git worktree add snapshots HEAD - - (cd snapshots && git checkout snapshots || git checkout --orphan snapshots && git rm -fr .) - - ./snapshotter.sh 10000 false false snapshots + - (cd snapshots && git checkout snapshots || (git checkout --orphan snapshots && git rm -fr .)) + - mkdir snapshot + - ALBS_RANGE_DOWNLOAD_ENABLED=true ALBS_RANGE_DOWNLOAD_CHUNK="$(git rev-list --count origin/snapshots)" ./snapshotter.sh -1 false false snapshot + - rsync --recursive snapshot/ snapshots/ - cd snapshots + - range="$(<range)" + - rm range - git add --all - - git commit --allow-empty --message="Snapshot $(date --iso-8601=minute)" + - 'git commit --allow-empty --message="Snapshot $(date --iso-8601=minute), tasks ${range}"' - git push --set-upstream origin snapshots pages: diff --git a/README.md b/README.md index 0f1281e2a9..df1976e777 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,13 @@ $ ./snapshotter.sh [maximum number of tasks to download] [download attachment: t ## How It Works 1. [`https://bugs.archlinux.org/index.php?project=0&status[]=&changedfrom=2021-04-01`](https://bugs.archlinux.org/index.php?project=0&status[]=&changedfrom=2021-04-01) is scrapped to get the newest task id -2. A list of URLs is generated: `https://bugs.archlinux.org/task/{1..$NEWEST_TASK_ID}` -3. 
`wget` starts downloading the URLs, including page requisites and linked user pages -4. `xsltproc` is run on all the HTML files to cleanup the html (remove navbar entries, login form etc.) -5. `prettier` is run on all the HTML files to prettify the HTML (primarily fixing indentation) +2. The range of tasks to download is decided: + * If `$ALBS_RANGE_DOWNLOAD_ENABLED = true` then: + * A range of tasks is computed based on `$ALBS_RANGE_DOWNLOAD_CHUNK` and `$ALBS_RANGE_DOWNLOAD_CHUNKS` + * else: + * `$min=1` + * `$max=$newest_task_id` +3. A list of URLs is generated: `https://bugs.archlinux.org/task/{$min..$max}` +4. `wget` starts downloading the URLs, including page requisites and linked user pages +5. `xsltproc` is run on all the HTML files to cleanup the html (remove navbar entries, login form etc.) +6. `prettier` is run on all the HTML files to prettify the HTML (primarily fixing indentation) diff --git a/snapshotter.sh b/snapshotter.sh index 6331dfaa77..bfd9e7ee64 100755 --- a/snapshotter.sh +++ b/snapshotter.sh @@ -1,13 +1,17 @@ #!/bin/bash set -o nounset -o errexit -o pipefail ALBS_PARALLELIZE_DOWNLOAD="${ALBS_PARALLELIZE_DOWNLOAD:-false}" +ALBS_RANGE_DOWNLOAD_ENABLED="${ALBS_RANGE_DOWNLOAD_ENABLED:-false}" +ALBS_RANGE_DOWNLOAD_CHUNK="${ALBS_RANGE_DOWNLOAD_CHUNK:-$(date +%-d)}" +ALBS_RANGE_DOWNLOAD_CHUNKS="${ALBS_RANGE_DOWNLOAD_CHUNKS:-10}" +ALBS_RANGE_DOWNLOAD_CHUNK="$((ALBS_RANGE_DOWNLOAD_CHUNK % ALBS_RANGE_DOWNLOAD_CHUNKS + 1))" function get_newest_task_id() { wget --output-document=- "https://bugs.archlinux.org/index.php?project=0&status[]=&changedfrom=2021-04-01" | grep --perl-regexp --only-matching --max-count 1 "(?<=task)[0-9]+" } function generate_urls() { - eval "echo https://bugs.archlinux.org/task/{1..$1} | tr ' ' '\n'" + eval "echo https://bugs.archlinux.org/task/{$1..$2} | tr ' ' '\n'" } function _download() { @@ -71,13 +75,27 @@ function prettify() { function main() { local newest_task_id - if [ -n "${1:-}" ]; then + if [ -n "${1:-}" ] && [ "${1}" != 
-1 ]; then newest_task_id="${1}" else newest_task_id="$(get_newest_task_id)" fi + local min=1 + local max=${newest_task_id} + + if [[ ${ALBS_RANGE_DOWNLOAD_ENABLED} = true ]]; then + local max_tasks="$(( newest_task_id / ALBS_RANGE_DOWNLOAD_CHUNKS ))" + local overlap="$(( max_tasks * 5 / 100))" # 5% overlap + + min="$(( (ALBS_RANGE_DOWNLOAD_CHUNK - 1) * max_tasks))" + max="$((ALBS_RANGE_DOWNLOAD_CHUNK * max_tasks + overlap))" + if (( newest_task_id < max )); then + max="${newest_task_id}" + fi + fi + local urls - urls="$(generate_urls "${newest_task_id}")" + urls="$(generate_urls "${min}" "${max}")" local orig_pwd="${PWD}" local snapshot_dir="" if [ -z "${4:-}" ]; then @@ -87,6 +105,7 @@ function main() { snapshot_dir="${4}" fi cd "${snapshot_dir}" + echo "${min}-${max}" > range download "${2:-true}" "${urls}" cleanup_html "${orig_pwd}" if [ "${3:-true}" = "true" ]; then -- GitLab