From 35713069fa71f34a5b0543931e29437703d3fe34 Mon Sep 17 00:00:00 2001
From: Matchu
Date: Thu, 13 Oct 2022 15:08:29 -0700
Subject: [PATCH] Delta version of archive scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I like running the full `archive:create` to help us be _confident_
we've got the whole darn thing, but it takes multiple days to run on my
machine and its slow HDD, which… I'm willing to do _sometimes_, but not
frequently.

But if we had a version of the script that ran faster, and only on URLs
we still _need_, we could run that more regularly and keep our live
archive relatively up-to-date. This would enable us to build reliable
fallback infra for when images.neopets.com isn't responding (like today
lol)!

Anyway, I stopped early in this process because images.neopets.com is
bad today, which means I can't really run updates today, lol :p But the
delta-ing stuff seems to work, and it takes closer to 30 minutes to get
the full state from the live archive. That's still slow, y'know, but
it'll make for a MUCH faster process than multiple days, lol
---
 package.json                                  |  5 +++-
 scripts/archive/create/.gitignore             |  6 ++++-
 .../archive/create/compute-backup-delta.sh    | 24 +++++++++++++++++++
 scripts/archive/create/download-urls-delta.sh |  3 +++
 scripts/archive/create/download-urls.sh       |  2 +-
 scripts/archive/create/read-backup-state.sh   | 14 +++++++++++
 6 files changed, 51 insertions(+), 3 deletions(-)
 create mode 100755 scripts/archive/create/compute-backup-delta.sh
 create mode 100755 scripts/archive/create/download-urls-delta.sh
 create mode 100755 scripts/archive/create/read-backup-state.sh

diff --git a/package.json b/package.json
index aa2b21a..b8c9f25 100644
--- a/package.json
+++ b/package.json
@@ -90,7 +90,10 @@
     "archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
     "aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
     "archive:create:upload": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR) s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)",
-    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/"
+    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/",
+    "archive:create:read-backup-state": "dotenv -- ./scripts/archive/create/read-backup-state.sh",
+    "archive:create:compute-backup-delta": "dotenv -- ./scripts/archive/create/compute-backup-delta.sh",
+    "archive:create:download-urls-delta": "dotenv -- ./scripts/archive/create/download-urls-delta.sh"
   },
   "browserslist": {
     "production": [
diff --git a/scripts/archive/create/.gitignore b/scripts/archive/create/.gitignore
index 3e86381..8b794ed 100644
--- a/scripts/archive/create/.gitignore
+++ b/scripts/archive/create/.gitignore
@@ -1 +1,5 @@
-/urls-cache.txt
\ No newline at end of file
+/urls-cache.txt
+/urls-cache.sorted.txt
+/urls-cache-backup.txt
+/urls-cache-backup.sorted.txt
+/urls-cache-delta.txt
\ No newline at end of file
diff --git a/scripts/archive/create/compute-backup-delta.sh b/scripts/archive/create/compute-backup-delta.sh
new file mode 100755
index 0000000..c30d244
--- /dev/null
+++ b/scripts/archive/create/compute-backup-delta.sh
@@ -0,0 +1,24 @@
+# Sort urls-cache-backup.txt (what we already have backed up).
+cat $(dirname $0)/urls-cache-backup.txt \
+  | \
+  sort \
+  | \
+  uniq - $(dirname $0)/urls-cache-backup.sorted.txt \
+  && \
+  # Sort urls-cache.txt (what's available on images.neopets.com).
+  cat $(dirname $0)/urls-cache.txt \
+  | \
+  sort \
+  | \
+  uniq - $(dirname $0)/urls-cache.sorted.txt \
+  && \
+  # Compute the diff between these two files, filtering to lines that start
+  # with "> ", meaning it's in urls-cache.txt but not in urls-cache-backup.txt.
+  diff $(dirname $0)/urls-cache-backup.sorted.txt $(dirname $0)/urls-cache.sorted.txt \
+  | \
+  grep '^>' \
+  | \
+  sed 's/^>\s*//' \
+  | \
+  # Output to urls-cache-delta.txt, and to the screen.
+  tee $(dirname $0)/urls-cache-delta.txt
\ No newline at end of file
diff --git a/scripts/archive/create/download-urls-delta.sh b/scripts/archive/create/download-urls-delta.sh
new file mode 100755
index 0000000..063ece6
--- /dev/null
+++ b/scripts/archive/create/download-urls-delta.sh
@@ -0,0 +1,3 @@
+# Run archive:create:download-urls, but using our delta URLs file specifically.
+URLS_CACHE=$(dirname $0)/urls-cache-delta.txt \
+  yarn archive:create:download-urls
\ No newline at end of file
diff --git a/scripts/archive/create/download-urls.sh b/scripts/archive/create/download-urls.sh
index e1ef541..aca861e 100755
--- a/scripts/archive/create/download-urls.sh
+++ b/scripts/archive/create/download-urls.sh
@@ -1,5 +1,5 @@
 echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
-xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
+xargs --arg-file=${URLS_CACHE=$(dirname $0)/urls-cache.txt} -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
 
 # It's expected that xargs will exit with code 123 if wget failed to load some
 # of the URLs. So, if it exited with 123, exit this script with 0 (success).
diff --git a/scripts/archive/create/read-backup-state.sh b/scripts/archive/create/read-backup-state.sh
new file mode 100755
index 0000000..7b88bb0
--- /dev/null
+++ b/scripts/archive/create/read-backup-state.sh
@@ -0,0 +1,14 @@
+# List all the files in our bucket. (The CLI handles pagination, thank you!)
+yarn aws s3 ls --recursive s3://dti-archive/ \
+  | \
+  # Filter out unnecessary lines; just give us lines formatted like results.
+  grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+' \
+  | \
+  # Replace all the extra info like time and size with "https://".
+  sed -E 's/^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+/https:\/\//' \
+  | \
+  # Hacky urlencode; the only % value in URLs list today is %20, so...
+  sed -E 's/ /%20/g' \
+  | \
+  # Output to urls-cache-backup.txt, and print to the screen.
+  tee $(dirname $0)/urls-cache-backup.txt
\ No newline at end of file
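
For context, a rough sketch of how these new scripts chain together end-to-end.
The patch itself doesn't spell out the invocation order, so treat this as an
assumed workflow: it relies on urls-cache.txt already existing (produced by the
existing archive:create tooling) and on the same dotenv configuration
(ARCHIVE_DIR, ARCHIVE_STORAGE_*) that the other archive:create scripts use.

    # 1. Rebuild our picture of what's already in the backup bucket.
    #    Writes urls-cache-backup.txt.
    yarn archive:create:read-backup-state

    # 2. Diff the live URL list (urls-cache.txt) against the backup list.
    #    Writes urls-cache-delta.txt: the URLs we haven't backed up yet.
    yarn archive:create:compute-backup-delta

    # 3. Download only the missing URLs into ARCHIVE_DIR, then sync to the
    #    bucket with the existing upload script.
    yarn archive:create:download-urls-delta
    yarn archive:create:upload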