From 35713069fa71f34a5b0543931e29437703d3fe34 Mon Sep 17 00:00:00 2001
From: Matchu
Date: Thu, 13 Oct 2022 15:08:29 -0700
Subject: [PATCH] Delta version of archive scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I like running the full `archive:create` to help us be _confident_
we've got the whole darn thing, but it takes multiple days to run on my
machine and its slow HDD, which… I'm willing to do _sometimes_, but not
frequently.

But if we had a version of the script that ran faster, and only on URLs
we still _need_, we could run that more regularly and keep our live
archive relatively up-to-date. This would enable us to build reliable
fallback infra for when images.neopets.com isn't responding (like today
lol)!

Anyway, I stopped early in this process because images.neopets.com is
bad today, which means I can't really run updates today, lol :p But the
delta-ing stuff seems to work, and it takes closer to 30 minutes to get
the full state from the live archive. That's still slow, y'know, but
it'll make for a MUCH faster process than multiple days, lol
---
 package.json                                  |  5 +++-
 scripts/archive/create/.gitignore             |  6 ++++-
 .../archive/create/compute-backup-delta.sh    | 24 +++++++++++++++++++
 scripts/archive/create/download-urls-delta.sh |  3 +++
 scripts/archive/create/download-urls.sh       |  2 +-
 scripts/archive/create/read-backup-state.sh   | 14 +++++++++++
 6 files changed, 51 insertions(+), 3 deletions(-)
 create mode 100755 scripts/archive/create/compute-backup-delta.sh
 create mode 100755 scripts/archive/create/download-urls-delta.sh
 create mode 100755 scripts/archive/create/read-backup-state.sh

diff --git a/package.json b/package.json
index aa2b21a..b8c9f25 100644
--- a/package.json
+++ b/package.json
@@ -90,7 +90,10 @@
     "archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
     "aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
     "archive:create:upload": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR) s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)",
-    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/"
+    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/",
+    "archive:create:read-backup-state": "dotenv -- ./scripts/archive/create/read-backup-state.sh",
+    "archive:create:compute-backup-delta": "dotenv -- ./scripts/archive/create/compute-backup-delta.sh",
+    "archive:create:download-urls-delta": "dotenv -- ./scripts/archive/create/download-urls-delta.sh"
   },
   "browserslist": {
     "production": [
diff --git a/scripts/archive/create/.gitignore b/scripts/archive/create/.gitignore
index 3e86381..8b794ed 100644
--- a/scripts/archive/create/.gitignore
+++ b/scripts/archive/create/.gitignore
@@ -1 +1,5 @@
-/urls-cache.txt
\ No newline at end of file
+/urls-cache.txt
+/urls-cache.sorted.txt
+/urls-cache-backup.txt
+/urls-cache-backup.sorted.txt
+/urls-cache-delta.txt
\ No newline at end of file
diff --git a/scripts/archive/create/compute-backup-delta.sh b/scripts/archive/create/compute-backup-delta.sh
new file mode 100755
index 0000000..c30d244
--- /dev/null
+++ b/scripts/archive/create/compute-backup-delta.sh
@@ -0,0 +1,24 @@
+# Sort urls-cache-backup.txt (what we already have backed up).
+cat $(dirname $0)/urls-cache-backup.txt \
+  | \
+  sort \
+  | \
+  uniq - $(dirname $0)/urls-cache-backup.sorted.txt \
+  && \
+  # Sort urls-cache.txt (what's available on images.neopets.com).
+  cat $(dirname $0)/urls-cache.txt \
+  | \
+  sort \
+  | \
+  uniq - $(dirname $0)/urls-cache.sorted.txt \
+  && \
+  # Compute the diff between these two files, filtering to lines that start
+  # with "> ", meaning it's in urls-cache.txt but not in urls-cache-backup.txt.
+  diff $(dirname $0)/urls-cache-backup.sorted.txt $(dirname $0)/urls-cache.sorted.txt \
+  | \
+  grep '^>' \
+  | \
+  sed 's/^>\s*//' \
+  | \
+  # Output to urls-cache-delta.txt, and to the screen.
+  tee $(dirname $0)/urls-cache-delta.txt
\ No newline at end of file
diff --git a/scripts/archive/create/download-urls-delta.sh b/scripts/archive/create/download-urls-delta.sh
new file mode 100755
index 0000000..063ece6
--- /dev/null
+++ b/scripts/archive/create/download-urls-delta.sh
@@ -0,0 +1,3 @@
+# Run archive:create:download-urls, but using our delta URLs file specifically.
+URLS_CACHE=$(dirname $0)/urls-cache-delta.txt \
+  yarn archive:create:download-urls
\ No newline at end of file
diff --git a/scripts/archive/create/download-urls.sh b/scripts/archive/create/download-urls.sh
index e1ef541..aca861e 100755
--- a/scripts/archive/create/download-urls.sh
+++ b/scripts/archive/create/download-urls.sh
@@ -1,5 +1,5 @@
 echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
-xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
+xargs --arg-file=${URLS_CACHE=$(dirname $0)/urls-cache.txt} -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
 
 # It's expected that xargs will exit with code 123 if wget failed to load some
 # of the URLs. So, if it exited with 123, exit this script with 0 (success).
diff --git a/scripts/archive/create/read-backup-state.sh b/scripts/archive/create/read-backup-state.sh
new file mode 100755
index 0000000..7b88bb0
--- /dev/null
+++ b/scripts/archive/create/read-backup-state.sh
@@ -0,0 +1,14 @@
+# List all the files in our bucket. (The CLI handles pagination, thank you!)
+yarn aws s3 ls --recursive s3://dti-archive/ \
+  | \
+  # Filter out unnecessary lines; just give us lines formatted like results.
+  grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+' \
+  | \
+  # Replace all the extra info like time and size with "https://".
+  sed -E 's/^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+/https:\/\//' \
+  | \
+  # Hacky urlencode; the only % value in URLs list today is %20, so...
+  sed -E 's/ /%20/g' \
+  | \
+  # Output to urls-cache-backup.txt, and print to the screen.
+  tee $(dirname $0)/urls-cache-backup.txt
\ No newline at end of file
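
For context, a rough sketch of how these new scripts chain together end-to-end.
The patch itself doesn't spell out the invocation order, so treat this as an
assumed workflow: it relies on urls-cache.txt already existing (produced by the
existing archive:create tooling) and on the same dotenv configuration
(ARCHIVE_DIR, ARCHIVE_STORAGE_*) that the other archive:create scripts use.

    # 1. Rebuild our picture of what's already in the backup bucket.
    #    Writes urls-cache-backup.txt.
    yarn archive:create:read-backup-state

    # 2. Diff the live URL list (urls-cache.txt) against the backup list.
    #    Writes urls-cache-delta.txt: the URLs we haven't backed up yet.
    yarn archive:create:compute-backup-delta

    # 3. Download only the missing URLs into ARCHIVE_DIR, then sync to the
    #    bucket with the existing upload script.
    yarn archive:create:download-urls-delta
    yarn archive:create:upload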