Some archive:create tweaks

I'm looking into what it would take to update the archive on a regular basis. The commands right now *are* pretty good at avoiding duplicate work… but the S3 upload still seems to take a very long time just to validate what's already in the archive. We might have to build our own little cache rather than using `aws s3 sync`, if we want faster incremental updates?
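(Just to make that last idea concrete, here's a rough sketch of what a homemade upload cache *could* look like, not something this commit does: keep a local manifest of keys we've already uploaded, and only `aws s3 cp` the ones that aren't in it yet. The bucket name, manifest filename, and the per-file `yarn aws` call are all placeholders for illustration.)

```sh
# Sketch only: incremental upload via a local manifest, instead of `aws s3 sync`.
# Assumes the existing `yarn aws` wrapper for credentials/endpoint; the bucket
# name and manifest filename are made up for this example.
MANIFEST="${ARCHIVE_DIR:?}/.upload-manifest.txt"
touch "$MANIFEST"
find "$ARCHIVE_DIR" -type f ! -name '.upload-manifest.txt' | while read -r file; do
  key="${file#$ARCHIVE_DIR/}"
  if ! grep -qxF "$key" "$MANIFEST"; then
    yarn aws s3 cp "$file" "s3://dti-archive/$key" \
      && echo "$key" >> "$MANIFEST"
  fi
done
```

(A real version would probably batch the uploads or call the aws CLI directly instead of spawning `yarn` per file, but this is the shape of the idea.)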

Here, I make a few quality-of-life changes to add an `archive:create` command that runs everything in a straight line. That way, I can let it run and see how much wall-time it takes, to decide whether speeding it up feels necessary. (vs. whether it's a few-hours task I can just set a reminder to run manually every week or something)
Emi Matchu 2022-10-02 07:08:40 -07:00
parent 07e2c0f7b1
commit b9b0db8b3a
3 changed files with 15 additions and 20 deletions

package.json

@@ -84,6 +84,7 @@
"delete-user": "yarn run-script scripts/delete-user.js",
"export-users-to-auth0": "yarn run-script scripts/export-users-to-auth0.js",
"validate-owls-data": "yarn run-script scripts/validate-owls-data.js",
"archive:create": "yarn archive:create:list-urls && yarn archive:create:download-urls && yarn archive:create:upload",
"archive:create:list-urls": "yarn run-script scripts/archive/create/list-urls.js",
"archive:create:download-urls": "dotenv -- ./scripts/archive/create/download-urls.sh",
"aws": "AWS_ACCESS_KEY_ID=$(dotenv -p ARCHIVE_STORAGE_READWRITE_ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(dotenv -p ARCHIVE_STORAGE_READWRITE_SECRET_KEY) aws --endpoint=https://$(dotenv -p ARCHIVE_STORAGE_HOST)",

scripts/archive/create/download-urls.sh

@@ -1,2 +1,15 @@
echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
xargs --arg-file=$(dirname $0)/urls-cache.txt -P 8 wget --directory-prefix=${ARCHIVE_DIR=$(dirname $0)} --force-directories --no-clobber --timeout=10 --retry-connrefused --retry-on-host-error --no-cookies --compression=auto --https-only --no-verbose
# It's expected that xargs will exit with code 123 if wget failed to load some
# of the URLs. So, if it exited with 123, exit this script with 0 (success).
# Otherwise, exit with the code that xargs exited with.
# (It would be nice if we could tell wget or xargs that a 404 isn't a failure?
# And have them succeed instead? But I couldn't find a way to do that!)
XARGS_EXIT_CODE=$?
if [ $XARGS_EXIT_CODE -eq 123 ]
then
exit 0
else
exit $XARGS_EXIT_CODE
fi
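(Re: the "404 isn't a failure" note in the comments above, one possible workaround, just a sketch and not something this commit does: point xargs at a tiny wrapper script instead of wget directly, and have the wrapper treat wget's exit code 8, "server issued an error response", which is what a 404 produces, as success. The wrapper name here is made up for this example.)

```sh
#!/usr/bin/env bash
# wget-ok-404.sh (hypothetical): run wget with whatever args we're given, but
# treat exit code 8 ("server issued an error response", e.g. a 404) as success,
# so xargs doesn't count those URLs as failures.
wget "$@"
WGET_EXIT_CODE=$?
if [ $WGET_EXIT_CODE -eq 8 ]
then
  exit 0
fi
exit $WGET_EXIT_CODE
```

(The catch is that exit code 8 also covers e.g. 500s, which we probably *would* still want to notice, so this isn't a free win.)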

scripts/archive/create/list-urls.js

@@ -29,15 +29,6 @@ const connectToDb = require("../../../src/server/db");
const { normalizeRow } = require("../../../src/server/util");
async function main() {
const urlsCacheFileAlreadyExists = await checkIfUrlsCacheFileAlreadyExists();
if (urlsCacheFileAlreadyExists) {
console.error(
`urls-cache.txt already exists. Please remove it first if you really ` +
`want to rebuild it from scratch!`
);
return 1;
}
const db = await connectToDb();
const file = await createUrlsCacheFile();
@@ -83,16 +74,6 @@ async function main() {
console.info(`Done writing asset URLs.`);
}
async function checkIfUrlsCacheFileAlreadyExists() {
const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
try {
await fs.access(urlsCacheFilePath, fs.constants.R_OK);
} catch (error) {
return false;
}
return true;
}
async function createUrlsCacheFile() {
const urlsCacheFilePath = path.join(__dirname, "urls-cache.txt");
return await fs.open(urlsCacheFilePath, "w");