Delta version of archive scripts
I like running the full `archive:create` to help us be _confident_ we've got the whole darn thing, but it takes multiple days to run on my machine and its slow HDD, which… I'm willing to do _sometimes_, but not frequently. But if we had a version of the script that ran faster, and only on URLs we still _need_, we could run that more regularly and keep our live archive relatively up-to-date. That would enable us to build reliable fallback infra for when images.neopets.com isn't responding (like today lol)! Anyway, I stopped early in this process because images.neopets.com is bad today, which means I can't really run updates today, lol :p But the delta-ing stuff seems to work, and takes closer to 30min to read the full state from the live archive. That's, y'know, still slow, but it will make for a MUCH faster process than multiple days, lol
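For reference, here's a rough sketch of how a delta refresh is meant to chain together, assuming urls-cache.txt has already been produced by the existing archive:create crawl step and the .env storage variables are set:

    # 1. Snapshot what the live archive bucket already has -> urls-cache-backup.txt
    yarn archive:create:read-backup-state
    # 2. Diff that against urls-cache.txt -> urls-cache-delta.txt
    yarn archive:create:compute-backup-delta
    # 3. Download only the still-missing URLs into ARCHIVE_DIR
    yarn archive:create:download-urls-delta
    # 4. Sync the newly downloaded files up to the bucket
    yarn archive:create:upload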
Parent: 861f3ab881
Commit: 35713069fa
6 changed files with 51 additions and 3 deletions
package.json
@@ -90,7 +90,10 @@
     "archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
     "aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
     "archive:create:upload": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR) s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)",
-    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/"
+    "archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/",
+    "archive:create:read-backup-state": "dotenv -- ./scripts/archive/create/read-backup-state.sh",
+    "archive:create:compute-backup-delta": "dotenv -- ./scripts/archive/create/compute-backup-delta.sh",
+    "archive:create:download-urls-delta": "dotenv -- ./scripts/archive/create/download-urls-delta.sh"
   },
   "browserslist": {
     "production": [
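The `aws` entry here isn't new, but it's what the new scripts lean on: it injects the read/write credentials and custom endpoint from .env, so everything else can just call `yarn aws`. A hypothetical one-off sanity check that the wrapper resolves everything, assuming dotenv-cli is available in your shell and .env is in the working directory:

    yarn aws s3 ls s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/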
scripts/archive/create/.gitignore
@@ -1 +1,5 @@
 /urls-cache.txt
+/urls-cache.sorted.txt
+/urls-cache-backup.txt
+/urls-cache-backup.sorted.txt
+/urls-cache-delta.txt
scripts/archive/create/compute-backup-delta.sh (new executable file)
@@ -0,0 +1,24 @@
+# Sort urls-cache-backup.txt (what we already have backed up).
+cat $(dirname $0)/urls-cache-backup.txt \
+| \
+sort \
+| \
+uniq - $(dirname $0)/urls-cache-backup.sorted.txt \
+&& \
+# Sort urls-cache.txt (what's available on images.neopets.com).
+cat $(dirname $0)/urls-cache.txt \
+| \
+sort \
+| \
+uniq - $(dirname $0)/urls-cache.sorted.txt \
+&& \
+# Compute the diff between these two files, filtering to lines that start
+# with "> ", meaning it's in urls-cache.txt but not in urls-cache-backup.txt.
+diff $(dirname $0)/urls-cache-backup.sorted.txt $(dirname $0)/urls-cache.sorted.txt \
+| \
+grep '^>' \
+| \
+sed 's/^>\s*//' \
+| \
+# Output to urls-cache-delta.txt, and to the screen.
+tee $(dirname $0)/urls-cache-delta.txt
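Side note: since both inputs end up sorted anyway, the diff/grep/sed dance could also be done with `comm`, which computes set differences directly on sorted files. A hedged alternative sketch, not what the script above actually does:

    # Lines present in urls-cache.sorted.txt but missing from urls-cache-backup.sorted.txt
    comm -13 urls-cache-backup.sorted.txt urls-cache.sorted.txt > urls-cache-delta.txt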
scripts/archive/create/download-urls-delta.sh (new executable file)
@@ -0,0 +1,3 @@
+# Run archive:create:download-urls, but using our delta URLs file specifically.
+URLS_CACHE=$(dirname $0)/urls-cache-delta.txt \
+yarn archive:create:download-urls
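The env assignment prefixes the yarn call, so URLS_CACHE reaches download-urls.sh through the environment of the child process. A hand-run equivalent (hypothetical, with the path written out relative to the repo root) would be:

    URLS_CACHE=scripts/archive/create/urls-cache-delta.txt yarn archive:create:download-urls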
scripts/archive/create/download-urls.sh
@@ -1,5 +1,5 @@
 echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
-xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
+xargs --arg-file=${URLS_CACHE=$(dirname $0)/urls-cache.txt} -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
 
 # It's expected that xargs will exit with code 123 if wget failed to load some
 # of the URLs. So, if it exited with 123, exit this script with 0 (success).
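The only functional change in download-urls.sh is swapping the hard-coded urls-cache.txt for `${URLS_CACHE=$(dirname $0)/urls-cache.txt}`: the `${VAR=default}` form assigns the default only when the variable is unset, so a plain archive:create:download-urls run behaves exactly as before. A minimal illustration of that expansion (illustrative filenames):

    unset URLS_CACHE
    echo "${URLS_CACHE=urls-cache.txt}"    # prints urls-cache.txt (default assigned)
    URLS_CACHE=urls-cache-delta.txt
    echo "${URLS_CACHE=urls-cache.txt}"    # prints urls-cache-delta.txt (default ignored)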
scripts/archive/create/read-backup-state.sh (new executable file)
@@ -0,0 +1,14 @@
+# List all the files in our bucket. (The CLI handles pagination, thank you!)
+yarn aws s3 ls --recursive s3://dti-archive/ \
+| \
+# Filter out unnecessary lines; just give us lines formatted like results.
+grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+' \
+| \
+# Replace all the extra info like time and size with "https://".
+sed -E 's/^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+/https:\/\//' \
+| \
+# Hacky urlencode; the only % value in URLs list today is %20, so...
+sed -E 's/ /%20/' \
+| \
+# Output to urls-cache-backup.txt, and print to the screen.
+tee $(dirname $0)/urls-cache-backup.txt
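To make that pipeline concrete, here's an illustrative (made-up) `s3 ls --recursive` result line and what it becomes:

    2022-11-01 12:34:56       1234 images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/example.svg
        -> https://images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/example.svg

The grep keeps only lines shaped like that, the first sed swaps the date/time/size prefix for "https://", and the second sed %20-encodes a space if one shows up in the key.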