Delta version of archive scripts

I like running the full `archive:create` to help us be _confident_ we've got the whole darn thing, but it takes multiple days to run on my machine and its slow HDD, which… I'm willing to do _sometimes_, but not frequently.

But if we had a version of the script that ran faster, and only on URLs we still _need_, we could run that more regularly and keep our live archive relatively up-to-date. This would enable us to build reliable fallback infra for when images.neopets.com isn't responding (like today lol)!

Anyway, I stopped early in this process because images.neopets.com is bad today, which means I can't really run updates today, lol :p But the delta-ing stuff seems to work, and it takes closer to 30min to read the full state from the live archive, which is, y'know, still slow, but will make for a MUCH faster process than multiple days, lol
Author: Emi Matchu, 2022-10-13 15:08:29 -07:00
Parent: 861f3ab881
Commit: 35713069fa
6 changed files with 51 additions and 3 deletions
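
To sketch how the new pieces fit together: my reading of the scripts below is that the delta flow runs in three steps, roughly like this (the ordering is inferred from the scripts themselves, not spelled out anywhere in the commit):

# 1. List everything already backed up in the storage bucket,
#    writing it to urls-cache-backup.txt.
yarn archive:create:read-backup-state

# 2. Diff that against urls-cache.txt (what's available on images.neopets.com),
#    writing the URLs we still need to urls-cache-delta.txt.
yarn archive:create:compute-backup-delta

# 3. Download just those missing URLs into the local archive dir.
yarn archive:create:download-urls-delta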

package.json

@@ -90,7 +90,10 @@
"archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
"aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
"archive:create:upload": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR) s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)",
"archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/"
"archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/",
"archive:create:read-backup-state": "dotenv -- ./scripts/archive/create/read-backup-state.sh",
"archive:create:compute-backup-delta": "dotenv -- ./scripts/archive/create/compute-backup-delta.sh",
"archive:create:download-urls-delta": "dotenv -- ./scripts/archive/create/download-urls-delta.sh"
},
"browserslist": {
"production": [

scripts/archive/create/.gitignore

@@ -1 +1,5 @@
/urls-cache.txt
/urls-cache.sorted.txt
/urls-cache-backup.txt
/urls-cache-backup.sorted.txt
/urls-cache-delta.txt

scripts/archive/create/compute-backup-delta.sh

@@ -0,0 +1,24 @@
# Sort urls-cache-backup.txt (what we already have backed up).
cat $(dirname $0)/urls-cache-backup.txt \
| \
sort \
| \
uniq - $(dirname $0)/urls-cache-backup.sorted.txt \
&& \
# Sort urls-cache.txt (what's available on images.neopets.com).
cat $(dirname $0)/urls-cache.txt \
| \
sort \
| \
uniq - $(dirname $0)/urls-cache.sorted.txt \
&& \
# Compute the diff between these two files, filtering to lines that start
# with "> ", meaning it's in urls-cache.txt but not in urls-cache-backup.txt.
diff $(dirname $0)/urls-cache-backup.sorted.txt $(dirname $0)/urls-cache.sorted.txt \
| \
grep '^>' \
| \
sed 's/^>\s*//' \
| \
# Output to urls-cache-delta.txt, and to the screen.
tee $(dirname $0)/urls-cache-delta.txt
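
Since both inputs are already sorted and de-duplicated by the time the diff runs, an equivalent (not what the commit uses, just a sketch) would be `comm -13`, which prints only the lines unique to its second file:

# Equivalent sketch of the delta step using comm instead of diff/grep/sed:
# -1 hides lines only in the backup list, -3 hides lines common to both,
# leaving lines that are on images.neopets.com but not yet backed up.
comm -13 \
  $(dirname $0)/urls-cache-backup.sorted.txt \
  $(dirname $0)/urls-cache.sorted.txt \
  | tee $(dirname $0)/urls-cache-delta.txt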

scripts/archive/create/download-urls-delta.sh

@@ -0,0 +1,3 @@
# Run archive:create:download-urls, but using our delta URLs file specifically.
URLS_CACHE=$(dirname $0)/urls-cache-delta.txt \
yarn archive:create:download-urls
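
This override works because download-urls.sh (next file) now reads its input via `${URLS_CACHE=...}`, which assigns and uses the default path only when the variable is unset. A tiny standalone illustration, with placeholder paths:

# ${VAR=default} keeps an existing value and only falls back when unset:
unset URLS_CACHE
echo "${URLS_CACHE=/path/to/urls-cache.txt}"        # prints the default, and assigns it
URLS_CACHE=/path/to/urls-cache-delta.txt
echo "${URLS_CACHE=/path/to/urls-cache.txt}"        # prints the delta path; default is ignored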

scripts/archive/create/download-urls.sh

@@ -1,5 +1,5 @@
echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
xargs --arg-file=${URLS_CACHE=$(dirname $0)/urls-cache.txt} -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
# It's expected that xargs will exit with code 123 if wget failed to load some
# of the URLs. So, if it exited with 123, exit this script with 0 (success).
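
The lines that actually implement that exit-code translation aren't visible in this hunk; handling along the lines the comment describes would look roughly like this (a sketch, not the commit's actual code):

# Capture the exit status of the xargs run above. xargs exits with 123 when any
# wget invocation exited with status 1-125, which here just means some URLs
# failed to download; treat that as success.
exit_code=$?
if [ "$exit_code" -eq 123 ]; then
  exit 0
else
  exit "$exit_code"
fi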

scripts/archive/create/read-backup-state.sh

@@ -0,0 +1,14 @@
# List all the files in our bucket. (The CLI handles pagination, thank you!)
yarn aws s3 ls --recursive s3://dti-archive/ \
| \
# Filter out unnecessary lines; just give us lines formatted like results.
grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+' \
| \
# Replace all the extra info like time and size with "https://".
sed -E 's/^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+/https:\/\//' \
| \
# Hacky urlencode: the only percent-escape that appears in the URL list today
# is %20, so replacing spaces (all of them, hence the /g) is all we need.
sed -E 's/ /%20/g' \
| \
# Output to urls-cache-backup.txt, and print to the screen.
tee $(dirname $0)/urls-cache-backup.txt
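
To make that pipeline concrete, here's roughly what one line looks like before and after it; the object key is borrowed from the upload-test path in package.json, and the size and filename are made up:

# Input line from `aws s3 ls --recursive` (size and filename are hypothetical):
#   2022-10-13 15:08:29      51234 images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/example.svg
# Output line, as written to urls-cache-backup.txt:
#   https://images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/example.svg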