From b9b0db8b3a029db8af8bfe5de30fceaf4d6324fc Mon Sep 17 00:00:00 2001
From: Matchu
Date: Sun, 2 Oct 2022 07:08:40 -0700
Subject: [PATCH] Some archive:create tweaks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I'm looking into what it would take to update the archive on a regular
basis. The commands right now *are* pretty good at avoiding duplicate
work… but the S3 upload still seems like it's taking very long even to
just validate what's in the archive already. We might have to build our
own little cache rather than using `aws s3 sync`, if we want faster
incremental updates?

Here, I make a few quality-of-life changes to add an `archive:create`
command that runs everything in a straight line. That way, I can let it
run and see how much wall-time it takes, to be able to decide whether
speeding it up feels necessary. (vs whether it's a few-hours task I can
just set a reminder to manually run every week or something)
---
 package.json                            |  1 +
 scripts/archive/create/download-urls.sh | 15 ++++++++++++++-
 scripts/archive/create/list-urls.js     | 19 -------------------
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/package.json b/package.json
index 7deec33..58e6693 100644
--- a/package.json
+++ b/package.json
@@ -84,6 +84,7 @@
     "delete-user": "yarn run-script scripts/delete-user.js",
     "export-users-to-auth0": "yarn run-script scripts/export-users-to-auth0.js",
     "validate-owls-data": "yarn run-script scripts/validate-owls-data.js",
+    "archive:create": "yarn archive:create:list-urls && yarn archive:create:download-urls && yarn archive:create:upload",
     "archive:create:list-urls": "yarn run-script scripts/archive/create/list-urls.js",
     "archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
     "aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",
diff --git a/scripts/archive/create/download-urls.sh b/scripts/archive/create/download-urls.sh
index 12297bb..e1ef541 100755
--- a/scripts/archive/create/download-urls.sh
+++ b/scripts/archive/create/download-urls.sh
@@ -1,2 +1,15 @@
 echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
-xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
\ No newline at end of file
+xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
+
+# It's expected that xargs will exit with code 123 if wget failed to load some
+# of the URLs. So, if it exited with 123, exit this script with 0 (success).
+# Otherwise, exit with the code that xargs exited with.
+# (It would be nice if we could tell wget or xargs that a 404 isn't a failure?
+# And have them succeed instead? But I couldn't find a way to do that!)
+XARGS_EXIT_CODE=$?
+if [ $XARGS_EXIT_CODE -eq 123 ]
+then
+  exit 0
+else
+  exit $XARGS_EXIT_CODE
+fi
diff --git a/scripts/archive/create/list-urls.js b/scripts/archive/create/list-urls.js
index afcf399..e55f467 100644
--- a/scripts/archive/create/list-urls.js
+++ b/scripts/archive/create/list-urls.js
@@ -29,15 +29,6 @@ const connectToDb = require("../../../src/server/db");
 const { normalizeRow } = require("../../../src/server/util");
 
 async function main() {
-  const urlsCacheFileAlreadyExists = await checkIfUrlsCacheFileAlreadyExists();
-  if (urlsCacheFileAlreadyExists) {
-    console.error(
-      `urls-cache.txt already exists. Please remove it first if you really ` +
-        `want to rebuild it from scratch!`
-    );
-    return 1;
-  }
-
   const db = await connectToDb();
 
   const file = await createUrlsCacheFile();
@@ -83,16 +74,6 @@ async function main() {
   console.info(`Done writing asset URLs.`);
 }
 
-async function checkIfUrlsCacheFileAlreadyExists() {
-  const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
-  try {
-    await fs.access(urlsCacheFilePath, fs.constants.R_OK);
-  } catch (error) {
-    return false;
-  }
-  return true;
-}
-
 async function createUrlsCacheFile() {
   const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
   return await fs.open(urlsCacheFilePath, "w");
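For reference, the 123 that download-urls.sh now special-cases is a general xargs convention, not anything wget-specific: xargs exits with 123 whenever at least one invocation of the command exits with a status in the range 1-125, which is what a 404 makes wget do. A minimal way to see the convention in isolation:

    # xargs exits 123 if any invocation of the command fails (status 1-125):
    printf 'a\nb\n' | xargs -n 1 false; echo $?   # prints 123
    printf 'a\nb\n' | xargs -n 1 true;  echo $?   # prints 0

So treating 123 as success means "some URLs failed, but the batch as a whole ran", which is exactly the behavior the script's comment asks for.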
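As for the commit message's idea of building "our own little cache" rather than using `aws s3 sync`: here's a minimal sketch of what that could look like, in the same spirit as download-urls.sh. Everything in it is an assumption rather than part of this patch, including the manifest filename, the ARCHIVE_BUCKET variable, and the use of the `yarn aws` wrapper defined in package.json:

    #!/usr/bin/env bash
    # Keep a local manifest of files we've already uploaded, so that
    # incremental runs only upload the new files, instead of waiting for
    # `aws s3 sync` to re-validate the whole archive against the bucket.
    cd "${ARCHIVE_DIR:-$(dirname "$0")}"
    touch uploaded-manifest.txt

    # Select the files on disk that aren't in the manifest yet, upload
    # each one, and record it in the manifest once its upload succeeds.
    find . -type f ! -name uploaded-manifest.txt | sort |
      comm -23 - <(sort uploaded-manifest.txt) |
      while read -r file; do
        yarn aws s3 cp "$file" "s3://$ARCHIVE_BUCKET/${file#./}" &&
          echo "$file" >> uploaded-manifest.txt
      done

Since the archive is append-only (wget runs with --no-clobber), tracking presence alone is probably enough; a fancier version could record sizes or checksums to catch changed files too.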