Some archive:create tweaks

I'm looking into what it would take to update the archive on a regular basis. The commands right now *are* pretty good at avoiding duplicate work… but the S3 upload still seems to take a very long time just to validate what's already in the archive. We might have to build our own little cache rather than using `aws s3 sync`, if we want faster incremental updates?
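(Just to make that last idea concrete, here's a rough sketch of what a homemade upload cache *could* look like, not something this commit does: keep a local manifest of keys we've already uploaded, and only `aws s3 cp` the ones that aren't in it yet. The bucket name, manifest filename, and the per-file `yarn aws` call are all placeholders for illustration.)

```sh
# Sketch only: incremental upload via a local manifest, instead of `aws s3 sync`.
# Assumes the existing `yarn aws` wrapper for credentials/endpoint; the bucket
# name and manifest filename are made up for this example.
MANIFEST="${ARCHIVE_DIR:?}/.upload-manifest.txt"
touch "$MANIFEST"
find "$ARCHIVE_DIR" -type f ! -name '.upload-manifest.txt' | while read -r file; do
  key="${file#$ARCHIVE_DIR/}"
  if ! grep -qxF "$key" "$MANIFEST"; then
    yarn aws s3 cp "$file" "s3://dti-archive/$key" \
      && echo "$key" >> "$MANIFEST"
  fi
done
```

(A real version would probably batch the uploads or call the aws CLI directly instead of spawning `yarn` per file, but this is the shape of the idea.)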

Here, I make a few quality-of-life changes to add an `archive:create` command that runs everything in a straight line. That way, I can let it run and see how much wall-time it takes, to decide whether speeding it up feels necessary. (vs. whether it's a few-hours task I can just set a reminder to run manually every week or something)
Emi Matchu 2022-10-02 07:08:40 -07:00
parent 07e2c0f7b1
commit b9b0db8b3a
3 changed files with 15 additions and 20 deletions

package.json

@@ -84,6 +84,7 @@
"delete-user": "yarn run-script scripts/delete-user.js",
"export-users-to-auth0": "yarn run-script scripts/export-users-to-auth0.js",
"validate-owls-data": "yarn run-script scripts/validate-owls-data.js",
"archive:create": "yarn archive:create:list-urls && yarn archive:create:download-urls && yarn archive:create:upload",
"archive:create:list-urls": "yarn run-script scripts/archive/create/list-urls.js",
"archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
"aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",

scripts/archive/create/download-urls.sh

@@ -1,2 +1,15 @@
echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
# It's expected that xargs will exit with code 123 if wget failed to load some
# of the URLs. So, if it exited with 123, exit this script with 0 (success).
# Otherwise, exit with the code that xargs exited with.
# (It would be nice if we could tell wget or xargs that a 404 isn't a failure?
# And have them succeed instead? But I couldn't find a way to do that!)
XARGS_EXIT_CODE=$?
if [ $XARGS_EXIT_CODE -eq 123 ]
then
exit 0
else
exit $XARGS_EXIT_CODE
fi
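(Re: the "404 isn't a failure" note in the comments above, one possible workaround, just a sketch and not something this commit does: point xargs at a tiny wrapper script instead of wget directly, and have the wrapper treat wget's exit code 8, "server issued an error response", which is what a 404 produces, as success. The wrapper name here is made up for this example.)

```sh
#!/usr/bin/env bash
# wget-ok-404.sh (hypothetical): run wget with whatever args we're given, but
# treat exit code 8 ("server issued an error response", e.g. a 404) as success,
# so xargs doesn't count those URLs as failures.
wget "$@"
WGET_EXIT_CODE=$?
if [ $WGET_EXIT_CODE -eq 8 ]
then
  exit 0
fi
exit $WGET_EXIT_CODE
```

(The catch is that exit code 8 also covers e.g. 500s, which we probably *would* still want to notice, so this isn't a free win.)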

scripts/archive/create/list-urls.js

@@ -29,15 +29,6 @@ const connectToDb = require("../../../src/server/db");
const { normalizeRow } = require("../../../src/server/util");
async function main() {
const urlsCacheFileAlreadyExists = await checkIfUrlsCacheFileAlreadyExists();
if (urlsCacheFileAlreadyExists) {
console.error(
`urls-cache.txt already exists. Please remove it first if you really ` +
`want to rebuild it from scratch!`
);
return 1;
}
const db = await connectToDb();
const file = await createUrlsCacheFile();
@@ -83,16 +74,6 @@ async function main() {
console.info(`Done writing asset URLs.`);
}
async function checkIfUrlsCacheFileAlreadyExists() {
const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
try {
await fs.access(urlsCacheFilePath, fs.constants.R_OK);
} catch (error) {
return false;
}
return true;
}
async function createUrlsCacheFile() {
const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
return await fs.open(urlsCacheFilePath, "w");