From 88511d3dc6890f994ad882e1d9f7871ba6e37edf Mon Sep 17 00:00:00 2001 From: Matchu Date: Sat, 5 Nov 2022 02:15:31 -0700 Subject: [PATCH] Implement archive:upload:delta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ok great, we can now run the delta archive process! It'd be nice to get this running on cron on the impress-2020 server, to a temporary folder? I *do* want to be remembering to run something regularly on my personal machine too though, to keep my own copy up-to-date… --- scripts/archive/prepare/remote.sh | 2 +- scripts/archive/upload/delta.sh | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/scripts/archive/prepare/remote.sh b/scripts/archive/prepare/remote.sh index f267811..7443db8 100755 --- a/scripts/archive/prepare/remote.sh +++ b/scripts/archive/prepare/remote.sh @@ -8,7 +8,7 @@ yarn aws s3 ls --recursive s3://dti-archive/ \ sed -E 's/^[0-9]{4}-[0-9]{2}-[0-9]{2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}\s+[0-9]+\s+/https:\/\//' \ | \ # Hacky urlencode; the only % value in URLs list today is %20, so... - sed -E 's/ /%20/' \ + sed -E 's/ /%20/g' \ | \ # Output to manifest-remote.txt, and print to the screen. tee $(dirname $0)/../manifest-remote.txt \ No newline at end of file diff --git a/scripts/archive/upload/delta.sh b/scripts/archive/upload/delta.sh index 2f08c1b..bbd3680 100755 --- a/scripts/archive/upload/delta.sh +++ b/scripts/archive/upload/delta.sh @@ -1 +1,20 @@ -echo 'archive:upload:delta -- TODO!' \ No newline at end of file +cat $(dirname $0)/../manifest-delta.txt \ + | \ + # Remove the URL scheme to convert it to a folder path in our archive + sed -E 's/^https?:\/\///' \ + | \ + # Hacky urldecode; the only % value in the URLs list today is %20, so... + sed -E 's/%20/ /g' \ + | \ + # Upload each URL to the remote archive! + # NOTE: This is slower than I'd hoped, probably because each command has to + # set up a new connection? 
If we needed to be faster, we could refactor + # the `create` step to download to a temporary delta folder, then `cp` + # that into the main archive, but run `aws s3 sync` on just the delta + # folder (with care not to delete keys that are present in the remote + # archive but not in the delta folder!). But this seems to run at an + # acceptable speed (i.e. a few hours) when it's run daily. + while IFS= read -r path; do # urldecoded paths can contain spaces: read them verbatim and quote every use + yarn aws s3 cp "$ARCHIVE_DIR/$path" "s3://$ARCHIVE_STORAGE_BUCKET/$path"; + done + \ No newline at end of file