Refactor archive scripts to prepare/create/upload

Sat down and thought about the structure here and how to make the full/delta stuff make more sense together! Here's what I came up with!

In both full and delta archiving, we prepare the manifest, we create the local archive, then we upload it to remote.
Emi Matchu 2022-10-13 16:07:12 -07:00
parent 35713069fa
commit 8dee9ddbed
14 changed files with 55 additions and 45 deletions
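Concretely, each of the two new top-level scripts chains the three phases in order, as the package.json changes below show:

    yarn archive:full    # archive:prepare:full  -> archive:create:full  -> archive:upload:full
    yarn archive:delta   # archive:prepare:delta -> archive:create:delta -> archive:upload:delta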

package.json (modified)

@@ -85,15 +85,17 @@
 "export-users-to-auth0": "yarn run-script scripts/export-users-to-auth0.js",
 "model-needed-items": "yarn run-script scripts/model-needed-items.js",
 "validate-owls-data": "yarn run-script scripts/validate-owls-data.js",
-"archive:create": "yarn archive:create:list-urls && yarn archive:create:download-urls && yarn archive:create:upload",
-"archive:create:list-urls": "yarn run-script scripts/archive/create/list-urls.js",
-"archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
 "aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
-"archive:create:upload": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR) s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)",
-"archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/",
-"archive:create:read-backup-state": "dotenv -- ./scripts/archive/create/read-backup-state.sh",
-"archive:create:compute-backup-delta": "dotenv -- ./scripts/archive/create/compute-backup-delta.sh",
-"archive:create:download-urls-delta": "dotenv -- ./scripts/archive/create/download-urls-delta.sh"
+"archive:full": "yarn archive:prepare:full && yarn archive:create:full && yarn archive:upload:full",
+"archive:delta": "yarn archive:prepare:delta && yarn archive:create:delta && yarn archive:upload:delta",
+"archive:prepare:full": "yarn run-script scripts/archive/prepare/full.js",
+"archive:prepare:remote": "dotenv -- ./scripts/archive/prepare/remote.sh",
+"archive:prepare:delta": "dotenv -- ./scripts/archive/prepare/delta.sh",
+"archive:create:full": "dotenv -- ./scripts/archive/create/full.sh",
+"archive:create:delta": "dotenv -- ./scripts/archive/create/delta.sh",
+"archive:upload:full": "dotenv -- ./scripts/archive/upload/full.sh",
+"archive:upload:delta": "dotenv -- ./scripts/archive/upload/delta.sh",
+"archive:upload:test": "dotenv -- ./scripts/archive/upload/test.sh"
 },
 "browserslist": {
 "production": [

scripts/archive/.gitignore (new file)

@@ -0,0 +1,5 @@
+/manifest-full.txt
+/manifest-full.sorted.txt
+/manifest-remote.txt
+/manifest-remote.sorted.txt
+/manifest-delta.txt

scripts/archive/create/.gitignore (deleted)

@@ -1,5 +0,0 @@
-/urls-cache.txt
-/urls-cache.sorted.txt
-/urls-cache-backup.txt
-/urls-cache-backup.sorted.txt
-/urls-cache-delta.txt

scripts/archive/create/compute-backup-delta.sh (deleted)

@@ -1,24 +0,0 @@
-# Sort urls-cache-backup.txt (what we already have backed up).
-cat $(dirname $0)/urls-cache-backup.txt \
-| \
-sort \
-| \
-uniq - $(dirname $0)/urls-cache-backup.sorted.txt \
-&& \
-# Sort urls-cache.txt (what's available on images.neopets.com).
-cat $(dirname $0)/urls-cache.txt \
-| \
-sort \
-| \
-uniq - $(dirname $0)/urls-cache.sorted.txt \
-&& \
-# Compute the diff between these two files, filtering to lines that start
-# with "> ", meaning it's in urls-cache.txt but not in urls-cache-backup.txt.
-diff $(dirname $0)/urls-cache-backup.sorted.txt $(dirname $0)/urls-cache.sorted.txt \
-| \
-grep '^>' \
-| \
-sed 's/^>\s*//' \
-| \
-# Output to urls-cache-delta.txt, and to the screen.
-tee $(dirname $0)/urls-cache-delta.txt

scripts/archive/create/delta.sh (new file)

@@ -0,0 +1 @@
+MANIFEST=$(dirname $0)/../manifest-delta.txt ./download-urls.sh
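(Both this script and create/full.sh below use the same trick: prefixing the command with `MANIFEST=...` sets that variable in the child script's environment only, so download-urls.sh can read $MANIFEST without it leaking into the caller's shell. One observation, not something this commit addresses: the bare `./download-urls.sh` resolves against the current working directory, so this spelling assumes you run it from scripts/archive/create/. A location-independent spelling, as a hypothetical alternative, would be:

    MANIFEST=$(dirname $0)/../manifest-delta.txt $(dirname $0)/download-urls.sh
)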

scripts/archive/create/download-urls-delta.sh (deleted)

@@ -1,3 +0,0 @@
-# Run archive:create:download-urls, but using our delta URLs file specifically.
-URLS_CACHE=$(dirname $0)/urls-cache-delta.txt \
-yarn archive:create:download-urls

scripts/archive/create/download-urls.sh (modified)

@@ -1,5 +1,5 @@
 echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
-xargs --arg-file=${URLS_CACHE=$(dirname $0)/urls-cache.txt} -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
+xargs --arg-file=$MANIFEST -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
 # It's expected that xargs will exit with code 123 if wget failed to load some
 # of the URLs. So, if it exited with 123, exit this script with 0 (success).
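The line implementing that exit-code handling falls outside this hunk. For readers unfamiliar with the convention: xargs exits with 123 when any invocation of the command it ran returned a status between 1 and 125, which here just means some URLs failed to download (e.g. a stale entry that now 404s). A common shell pattern for tolerating exactly that case, purely as a sketch of what the unshown line likely resembles:

    xargs ... wget ... || { code=$?; [ $code -eq 123 ] && exit 0; exit $code; }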

scripts/archive/create/full.sh (new executable file)

@@ -0,0 +1 @@
+MANIFEST=$(dirname $0)/../manifest-full.txt ./download-urls.sh

scripts/archive/prepare/delta.sh (new file)

@@ -0,0 +1,30 @@
+# Prepare the full manifest of URLs.
+yarn archive:prepare:full \
+&& \
+# Prepare the manifest of URLs already present on the remote archive.
+yarn archive:prepare:remote \
+&& \
+# Sort manifest-remote.txt (what we already have backed up).
+cat $(dirname $0)/../manifest-remote.txt \
+| \
+sort \
+| \
+uniq - $(dirname $0)/../manifest-remote.sorted.txt \
+&& \
+# Sort manifest-full.txt (what's available on images.neopets.com).
+cat $(dirname $0)/../manifest-full.txt \
+| \
+sort \
+| \
+uniq - $(dirname $0)/../manifest-full.sorted.txt \
+&& \
+# Compute the diff between these two files, filtering to lines that start
+# with "> ", meaning it's in manifest-full.txt but not in manifest-remote.txt.
+diff $(dirname $0)/../manifest-remote.sorted.txt $(dirname $0)/../manifest-full.sorted.txt \
+| \
+grep '^>' \
+| \
+sed 's/^>\s*//' \
+| \
+# Output to manifest-delta.txt, and to the screen.
+tee $(dirname $0)/../manifest-delta.txt
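Two idioms here are easy to misread. First, `uniq - <file>` takes stdin as its input and the named file as its output, which is why the sort pipelines need no shell redirect. Second, the diff/grep/sed chain at the end: a tiny worked example with made-up URLs, assuming these sorted manifests:

    manifest-remote.sorted.txt:
    https://images.neopets.com/cp/items/a.png
    https://images.neopets.com/cp/items/b.png

    manifest-full.sorted.txt:
    https://images.neopets.com/cp/items/a.png
    https://images.neopets.com/cp/items/b.png
    https://images.neopets.com/cp/items/c.png

`diff` prints the line unique to manifest-full.sorted.txt as `> https://images.neopets.com/cp/items/c.png`; `grep '^>'` keeps only such lines, and `sed 's/^>\s*//'` strips the marker. So manifest-delta.txt ends up holding exactly the URLs that exist upstream but aren't on the remote yet:

    https://images.neopets.com/cp/items/c.png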

scripts/archive/create/list-urls.js → scripts/archive/prepare/full.js (renamed, with updated paths)

@@ -1,5 +1,5 @@
 /**
- * archive:create:list-urls generates a urls-cache.txt file, containing all of
+ * archive:prepare:full generates a manifest-full.txt file, containing all of
  * the images.neopets.com URLs for customization that Dress to Impress is aware
  * of. This will enable us to back them all up in an archive!
  *
@@ -75,7 +75,7 @@ async function main() {
 }
 
 async function createUrlsCacheFile() {
-  const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
+  const urlsCacheFilePath = path.join(__dirname, "../manifest-full.txt");
   return await fs.open(urlsCacheFilePath, "w");
 }
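Since this script now lives in scripts/archive/prepare/, the `path.join(__dirname, "../manifest-full.txt")` call writes the manifest into the shared scripts/archive/ directory, which is exactly what the new .gitignore above covers. A quick way to inspect the output after a run (a hypothetical check, assuming a POSIX shell):

    yarn archive:prepare:full
    wc -l scripts/archive/manifest-full.txt   # how many URLs DTI knows about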

scripts/archive/create/read-backup-state.sh → scripts/archive/prepare/remote.sh (renamed, with updated output path)

@@ -10,5 +10,5 @@ yarn aws s3 ls --recursive s3://dti-archive/ \
 # Hacky urlencode; the only % value in URLs list today is %20, so...
 sed -E 's/ /%20/' \
 | \
-# Output to urls-cache-backup.txt, and print to the screen.
-tee $(dirname $0)/urls-cache-backup.txt
+# Output to manifest-remote.txt, and print to the screen.
+tee $(dirname $0)/../manifest-remote.txt
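One subtlety in the context lines above: without a `g` flag, `sed -E 's/ /%20/'` only encodes the first space on each line. That matches the author's note that %20 is the only escape needed today, but a key with two spaces would slip through; `sed -E 's/ /%20/g'` would be the exhaustive version. With a made-up filename to illustrate:

    $ echo 'images.neopets.com/foo bar baz.png' | sed -E 's/ /%20/'
    images.neopets.com/foo%20bar baz.png
    $ echo 'images.neopets.com/foo bar baz.png' | sed -E 's/ /%20/g'
    images.neopets.com/foo%20bar%20baz.png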

scripts/archive/upload/delta.sh (new file)

@@ -0,0 +1 @@
+echo 'archive:upload:delta -- TODO!'
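The delta upload is deliberately left as a stub here. Purely as a sketch of one plausible shape (this is not in the commit; it assumes the local path for each URL is just the URL with its scheme stripped, matching how wget --force-directories lays files out):

    # Hypothetical: upload only the files named in manifest-delta.txt.
    sed 's|^https://||' $(dirname $0)/../manifest-delta.txt | while read -r f; do
      yarn aws s3 cp "$ARCHIVE_DIR/$f" "s3://$ARCHIVE_STORAGE_BUCKET/$f"
    done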

scripts/archive/upload/full.sh (new executable file)

@@ -0,0 +1 @@
+yarn aws s3 sync $ARCHIVE_DIR s3://$ARCHIVE_STORAGE_BUCKET
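Note that `aws s3 sync` compares size and modification time against what's already in the bucket and only uploads new or changed files, so re-running the full upload after a mostly-no-op download pass stays cheap. When poking at these scripts, the CLI's standard `--dryrun` flag prints the planned transfers without uploading anything, e.g. by temporarily appending it to the sync line:

    yarn aws s3 sync $ARCHIVE_DIR s3://$ARCHIVE_STORAGE_BUCKET --dryrun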

scripts/archive/upload/test.sh (new executable file)

@@ -0,0 +1 @@
+yarn aws s3 sync $ARCHIVE_DIR/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$ARCHIVE_STORAGE_BUCKET/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/
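Syncing a single item's asset directory like this makes for a quick end-to-end smoke test of the credentials, the custom endpoint, and bucket permissions before committing to a long full upload:

    yarn archive:upload:test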