Delta version of archive scripts
I like running the full `archive:create` to help us be _confident_ we've got the whole darn thing, but it takes multiple days to run on my machine and its slow HDD, which… I'm willing to do _sometimes_, but not frequently. But if we had a version of the script that ran faster, and only on URLs we still _need_, we could run that more regularly and keep our live archive relatively up-to-date. This would enable us to build reliable fallback infra for when images.neopets.com isn't responding (like today lol)! Anyway, I stopped early in this process because images.neopets.com is bad today, which means I can't really run updates today, lol :p but the delta-ing stuff seems to work, and takes closer to 30min to get the full state from the live archive, which is, y'know, still slow, but will make for a MUCH faster process than multiple days, lol
parent 861f3ab881
commit 35713069fa
6 changed files with 51 additions and 3 deletions
package.json
@@ -90,7 +90,10 @@
     "archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
     "aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
     "archive:create:upload": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR) s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)",
-    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/"
+    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/",
+    "archive:create:read-backup-state": "dotenv -- ./scripts/archive/create/read-backup-state.sh",
+    "archive:create:compute-backup-delta": "dotenv -- ./scripts/archive/create/compute-backup-delta.sh",
+    "archive:create:download-urls-delta": "dotenv -- ./scripts/archive/create/download-urls-delta.sh"
   },
   "browserslist": {
     "production": [
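For context, the new package.json scripts chain together into a delta update flow. A rough end-to-end run might look like the sketch below; the ordering follows from how the scripts feed each other's files and isn't spelled out in the commit itself:

# Sketch of a full delta run (command order inferred, not part of the commit):
yarn archive:create:read-backup-state      # list the bucket -> urls-cache-backup.txt
yarn archive:create:compute-backup-delta   # compare against urls-cache.txt -> urls-cache-delta.txt
yarn archive:create:download-urls-delta    # wget only the URLs in urls-cache-delta.txt
yarn archive:create:upload                 # sync the newly downloaded files back to the bucket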
scripts/archive/create/.gitignore (vendored, 6 changed lines)
@@ -1 +1,5 @@
-/urls-cache.txt
+/urls-cache.txt
+/urls-cache.sorted.txt
+/urls-cache-backup.txt
+/urls-cache-backup.sorted.txt
+/urls-cache-delta.txt
scripts/archive/create/compute-backup-delta.sh (new executable file, 24 lines)
@@ -0,0 +1,24 @@
+# Sort urls-cache-backup.txt (what we already have backed up).
+cat $(dirname $0)/urls-cache-backup.txt \
+| \
+sort \
+| \
+uniq - $(dirname $0)/urls-cache-backup.sorted.txt \
+&& \
+# Sort urls-cache.txt (what's available on images.neopets.com).
+cat $(dirname $0)/urls-cache.txt \
+| \
+sort \
+| \
+uniq - $(dirname $0)/urls-cache.sorted.txt \
+&& \
+# Compute the diff between these two files, filtering to lines that start
+# with "> ", meaning it's in urls-cache.txt but not in urls-cache-backup.txt.
+diff $(dirname $0)/urls-cache-backup.sorted.txt $(dirname $0)/urls-cache.sorted.txt \
+| \
+grep '^>' \
+| \
+sed 's/^>\s*//' \
+| \
+# Output to urls-cache-delta.txt, and to the screen.
+tee $(dirname $0)/urls-cache-delta.txt
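The sort / uniq / diff / grep / sed pipeline above computes a set difference: URLs present in urls-cache.txt but missing from urls-cache-backup.txt. As a point of comparison only (not what the committed script does), the same delta could be expressed with comm(1) once both files are sorted:

# Equivalent set difference with comm, assuming both inputs are already sorted:
# -1 hides lines only in the backup list, -3 hides lines common to both,
# leaving the lines that exist on images.neopets.com but aren't backed up yet.
comm -13 urls-cache-backup.sorted.txt urls-cache.sorted.txt > urls-cache-delta.txt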
scripts/archive/create/download-urls-delta.sh (new executable file, 3 lines)
@@ -0,0 +1,3 @@
+# Run archive:create:download-urls, but using our delta URLs file specifically.
+URLS_CACHE=$(dirname $0)/urls-cache-delta.txt \
+yarn archive:create:download-urls
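Because the wrapper only sets URLS_CACHE before delegating to archive:create:download-urls, the same pattern works for any hand-picked URL list, for example (hypothetical file name):

URLS_CACHE=./some-other-url-list.txt yarn archive:create:download-urls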
scripts/archive/create/download-urls.sh
@@ -1,5 +1,5 @@
 echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
-xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
+xargs --arg-file=${URLS_CACHE=$(dirname $0)/urls-cache.txt} -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
 
 # It's expected that xargs will exit with code 123 if wget failed to load some
 # of the URLs. So, if it exited with 123, exit this script with 0 (success).
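The only change here is swapping the hard-coded urls-cache.txt path for ${URLS_CACHE=$(dirname $0)/urls-cache.txt}, which uses the shell's assign-default expansion: if URLS_CACHE is unset, it gets assigned the default path; if the delta wrapper already set it, that value wins. A minimal illustration with made-up paths:

unset URLS_CACHE
echo "${URLS_CACHE=/tmp/urls-cache.txt}"    # prints /tmp/urls-cache.txt and assigns it
URLS_CACHE=/tmp/urls-cache-delta.txt
echo "${URLS_CACHE=/tmp/urls-cache.txt}"    # prints /tmp/urls-cache-delta.txt; the default is ignored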
scripts/archive/create/read-backup-state.sh (new executable file, 14 lines)
@@ -0,0 +1,14 @@
+# List all the files in our bucket. (The CLI handles pagination, thank you!)
+yarn aws s3 ls --recursive s3://dti-archive/ \
+| \
+# Filter out unnecessary lines; just give us lines formatted like results.
+grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+' \
+| \
+# Replace all the extra info like time and size with "https://".
+sed -E 's/^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+/https:\/\//' \
+| \
+# Hacky urlencode; the only % value in URLs list today is %20, so...
+sed -E 's/ /%20/' \
+| \
+# Output to urls-cache-backup.txt, and print to the screen.
+tee $(dirname $0)/urls-cache-backup.txt
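One small caveat with the "hacky urlencode" step: without a g flag, sed -E 's/ /%20/' only encodes the first space on each line. That's apparently fine for today's key names, but if a key ever contains more than one space, a global variant would be needed, e.g.:

# Encode every space on the line, not just the first (still assumes spaces are
# the only characters that need escaping):
sed -E 's/ /%20/g'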