Refactor archive scripts to prepare/create/upload
Sat down and thought about the structure here and how to make the full/delta stuff make more sense together! Here's what I came up with! In both full and delta archiving, we prepare the manifest, we create the local archive, then we upload it to remote.
This commit is contained in:
parent
35713069fa
commit
8dee9ddbed
14 changed files with 55 additions and 45 deletions
18
package.json
18
package.json
|
@@ -85,15 +85,17 @@
|
||||||
"export-users-to-auth0": "yarn run-script scripts/export-users-to-auth0.js",
|
"export-users-to-auth0": "yarn run-script scripts/export-users-to-auth0.js",
|
||||||
"model-needed-items": "yarn run-script scripts/model-needed-items.js",
|
"model-needed-items": "yarn run-script scripts/model-needed-items.js",
|
||||||
"validate-owls-data": "yarn run-script scripts/validate-owls-data.js",
|
"validate-owls-data": "yarn run-script scripts/validate-owls-data.js",
|
||||||
"archive:create": "yarn archive:create:list-urls && yarn archive:create:download-urls && yarn archive:create:upload",
|
|
||||||
"archive:create:list-urls": "yarn run-script scripts/archive/create/list-urls.js",
|
|
||||||
"archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
|
|
||||||
"aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
|
"aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
|
||||||
"archive:create:upload": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR) s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)",
|
"archive:full": "yarn archive:prepare:full && yarn archive:create:full && yarn archive:upload:full",
|
||||||
"archive:create:upload-test": "yarn aws s3 sync $(dotenv -p ARCHIVE_DIR)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$(dotenv -p ARCHIVE_STORAGE_BUCKET)/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/",
|
"archive:delta": "yarn archive:prepare:delta && yarn archive:create:delta && yarn archive:upload:delta",
|
||||||
"archive:create:read-backup-state": "dotenv -- ./scripts/archive/create/read-backup-state.sh",
|
"archive:prepare:full": "yarn run-script scripts/archive/prepare/full.js",
|
||||||
"archive:create:compute-backup-delta": "dotenv -- ./scripts/archive/create/compute-backup-delta.sh",
|
"archive:prepare:remote": "dotenv -- ./scripts/archive/prepare/remote.sh",
|
||||||
"archive:create:download-urls-delta": "dotenv -- ./scripts/archive/create/download-urls-delta.sh"
|
"archive:prepare:delta": "dotenv -- ./scripts/archive/prepare/delta.sh",
|
||||||
|
"archive:create:full": "dotenv -- ./scripts/archive/create/full.sh",
|
||||||
|
"archive:create:delta": "dotenv -- ./scripts/archive/create/delta.sh",
|
||||||
|
"archive:upload:full": "dotenv -- ./scripts/archive/upload/full.sh",
|
||||||
|
"archive:upload:delta": "dotenv -- ./scripts/archive/upload/delta.sh",
|
||||||
|
"archive:upload:test": "dotenv -- ./scripts/archive/upload/test.sh"
|
||||||
},
|
},
|
||||||
"browserslist": {
|
"browserslist": {
|
||||||
"production": [
|
"production": [
|
||||||
|
|
5
scripts/archive/.gitignore
vendored
Normal file
5
scripts/archive/.gitignore
vendored
Normal file
|
@@ -0,0 +1,5 @@
|
||||||
|
/manifest-full.txt
|
||||||
|
/manifest-full.sorted.txt
|
||||||
|
/manifest-remote.txt
|
||||||
|
/manifest-remote.sorted.txt
|
||||||
|
/manifest-delta.txt
|
5
scripts/archive/create/.gitignore
vendored
5
scripts/archive/create/.gitignore
vendored
|
@@ -1,5 +0,0 @@
|
||||||
/urls-cache.txt
|
|
||||||
/urls-cache.sorted.txt
|
|
||||||
/urls-cache-backup.txt
|
|
||||||
/urls-cache-backup.sorted.txt
|
|
||||||
/urls-cache-delta.txt
|
|
|
@@ -1,24 +0,0 @@
|
||||||
# Sort urls-cache-backup.txt (what we already have backed up).
|
|
||||||
cat $(dirname $0)/urls-cache-backup.txt \
|
|
||||||
| \
|
|
||||||
sort \
|
|
||||||
| \
|
|
||||||
uniq - $(dirname $0)/urls-cache-backup.sorted.txt \
|
|
||||||
&& \
|
|
||||||
# Sort urls-cache.txt (what's available on images.neopets.com).
|
|
||||||
cat $(dirname $0)/urls-cache.txt \
|
|
||||||
| \
|
|
||||||
sort \
|
|
||||||
| \
|
|
||||||
uniq - $(dirname $0)/urls-cache.sorted.txt \
|
|
||||||
&& \
|
|
||||||
# Compute the diff between these two files, filtering to lines that start
|
|
||||||
# with "> ", meaning it's in urls-cache.txt but not in urls-cache-backup.txt.
|
|
||||||
diff $(dirname $0)/urls-cache-backup.sorted.txt $(dirname $0)/urls-cache.sorted.txt \
|
|
||||||
| \
|
|
||||||
grep '^>' \
|
|
||||||
| \
|
|
||||||
sed 's/^>\s*//' \
|
|
||||||
| \
|
|
||||||
# Output to urls-cache-delta.txt, and to the screen.
|
|
||||||
tee $(dirname $0)/urls-cache-delta.txt
|
|
1
scripts/archive/create/delta.sh
Executable file
1
scripts/archive/create/delta.sh
Executable file
|
@@ -0,0 +1 @@
|
||||||
|
MANIFEST=$(dirname $0)/../manifest-delta.txt ./download-urls.sh
|
|
@@ -1,3 +0,0 @@
|
||||||
# Run archive:create:download-urls, but using our delta URLs file specifically.
|
|
||||||
URLS_CACHE=$(dirname $0)/urls-cache-delta.txt \
|
|
||||||
yarn archive:create:download-urls
|
|
|
@@ -1,5 +1,5 @@
|
||||||
echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
|
echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
|
||||||
xargs --arg-file=${URLS_CACHE=$(dirname $0)/urls-cache.txt} -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
|
xargs --arg-file=$MANIFEST -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
|
||||||
|
|
||||||
# It's expected that xargs will exit with code 123 if wget failed to load some
|
# It's expected that xargs will exit with code 123 if wget failed to load some
|
||||||
# of the URLs. So, if it exited with 123, exit this script with 0 (success).
|
# of the URLs. So, if it exited with 123, exit this script with 0 (success).
|
||||||
|
|
1
scripts/archive/create/full.sh
Executable file
1
scripts/archive/create/full.sh
Executable file
|
@@ -0,0 +1 @@
|
||||||
|
MANIFEST=$(dirname $0)/../manifest-full.txt ./download-urls.sh
|
30
scripts/archive/prepare/delta.sh
Executable file
30
scripts/archive/prepare/delta.sh
Executable file
|
@@ -0,0 +1,30 @@
|
||||||
|
# Prepare the full manifest of URLs.
|
||||||
|
yarn archive:prepare:full \
|
||||||
|
&& \
|
||||||
|
# Prepare the manifest of URLs already present on the remote archive.
|
||||||
|
yarn archive:prepare:remote \
|
||||||
|
&& \
|
||||||
|
# Sort manifest-remote.txt (what we already have backed up).
|
||||||
|
cat $(dirname $0)/../manifest-remote.txt \
|
||||||
|
| \
|
||||||
|
sort \
|
||||||
|
| \
|
||||||
|
uniq - $(dirname $0)/../manifest-remote.sorted.txt \
|
||||||
|
&& \
|
||||||
|
# Sort manifest-full.txt (what's available on images.neopets.com).
|
||||||
|
cat $(dirname $0)/../manifest-full.txt \
|
||||||
|
| \
|
||||||
|
sort \
|
||||||
|
| \
|
||||||
|
uniq - $(dirname $0)/../manifest-full.sorted.txt \
|
||||||
|
&& \
|
||||||
|
# Compute the diff between these two files, filtering to lines that start
|
||||||
|
# with "> ", meaning it's in manifest-full.txt but not in manifest-remote.txt.
|
||||||
|
diff $(dirname $0)/../manifest-remote.sorted.txt $(dirname $0)/../manifest-full.sorted.txt \
|
||||||
|
| \
|
||||||
|
grep '^>' \
|
||||||
|
| \
|
||||||
|
sed 's/^>\s*//' \
|
||||||
|
| \
|
||||||
|
# Output to manifest-delta.txt, and to the screen.
|
||||||
|
tee $(dirname $0)/../manifest-delta.txt
|
|
@@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* archive:create:list-urls generates a urls-cache.txt file, containing all of
|
* archive:prepare:full generates a manifest-full.txt file, containing all of
|
||||||
* the images.neopets.com URLs for customization that Dress to Impress is aware
|
* the images.neopets.com URLs for customization that Dress to Impress is aware
|
||||||
* of. This will enable us to back them all up in an archive!
|
* of. This will enable us to back them all up in an archive!
|
||||||
*
|
*
|
||||||
|
@@ -75,7 +75,7 @@ async function main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function createUrlsCacheFile() {
|
async function createUrlsCacheFile() {
|
||||||
const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
|
const urlsCacheFilePath = path.join(__dirname, "../manifest-full.txt");
|
||||||
return await fs.open(urlsCacheFilePath, "w");
|
return await fs.open(urlsCacheFilePath, "w");
|
||||||
}
|
}
|
||||||
|
|
|
@@ -10,5 +10,5 @@ yarn aws s3 ls --recursive s3://dti-archive/ \
|
||||||
# Hacky urlencode; the only % value in URLs list today is %20, so...
|
# Hacky urlencode; the only % value in URLs list today is %20, so...
|
||||||
sed -E 's/ /%20/' \
|
sed -E 's/ /%20/' \
|
||||||
| \
|
| \
|
||||||
# Output to urls-cache-backup.txt, and print to the screen.
|
# Output to manifest-remote.txt, and print to the screen.
|
||||||
tee $(dirname $0)/urls-cache-backup.txt
|
tee $(dirname $0)/../manifest-remote.txt
|
1
scripts/archive/upload/delta.sh
Executable file
1
scripts/archive/upload/delta.sh
Executable file
|
@@ -0,0 +1 @@
|
||||||
|
echo 'archive:upload:delta -- TODO!'
|
1
scripts/archive/upload/full.sh
Executable file
1
scripts/archive/upload/full.sh
Executable file
|
@@ -0,0 +1 @@
|
||||||
|
yarn aws s3 sync $ARCHIVE_DIR s3://$ARCHIVE_STORAGE_BUCKET
|
1
scripts/archive/upload/test.sh
Executable file
1
scripts/archive/upload/test.sh
Executable file
|
@@ -0,0 +1 @@
|
||||||
|
yarn aws s3 sync $ARCHIVE_DIR/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/ s3://$ARCHIVE_STORAGE_BUCKET/images.neopets.com/cp/items/data/000/000/000/1_8422bedbf9/
|
Loading…
Reference in a new issue