From d9ca07c9b0b951a4a1f442545b1a438d5cfa790b Mon Sep 17 00:00:00 2001
From: Matchu
Date: Mon, 12 Sep 2022 21:47:28 -0700
Subject: [PATCH] Use wget for archive:create:download-urls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hey this is an exciting development! Cloning a list of URLs onto our hard
drive turns out to be something `wget` is already very good at!

Originally I used `wget`'s `--input-file` option to process the
`urls-cache.txt` file, but then I learned how to parallelize it from this
StackOverflow answer: https://stackoverflow.com/a/11850469/107415.
(Following the guidance in the comments, I removed `-n 1`, to avoid the
overhead of extra processes and allow `wget` instances to keep using shared
connections over time. I don't know why it was in there; maybe the author
didn't know `wget` accepts multiple args?)

Anyway yeah, it's working great, except for the weird images.neopets.com
downtime! 😅

Specifically, I'm noticing that all the item thumbnail images came back
really fast, but the customization images are taking for-EV-er. I wonder if
that's just caching properties, or if there's a different backing server
for them that's responding much more slowly? Who's to say!

In any case, I'm keeping the timeout in this script pretty low (10
seconds), and just letting failures fail. We can try re-running it
sometime when the downtime is resolved or the cache is warmed up.
---
 package.json                            |  3 ++-
 scripts/archive/create/download-urls.sh | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100755 scripts/archive/create/download-urls.sh

diff --git a/package.json b/package.json
index a86fbea..fc63bdd 100644
--- a/package.json
+++ b/package.json
@@ -83,7 +83,8 @@
     "delete-user": "ts-node --compiler=typescript-cached-transpile --transpile-only -r dotenv/config scripts/delete-user.js",
     "export-users-to-auth0": "ts-node --compiler=typescript-cached-transpile --transpile-only -r dotenv/config scripts/export-users-to-auth0.js",
     "validate-owls-data": "ts-node --compiler=typescript-cached-transpile --transpile-only -r dotenv/config scripts/validate-owls-data.js",
-    "archive:create:list-urls": "ts-node --compiler=typescript-cached-transpile --transpile-only -r dotenv/config scripts/archive/create/list-urls.js"
+    "archive:create:list-urls": "ts-node --compiler=typescript-cached-transpile --transpile-only -r dotenv/config scripts/archive/create/list-urls.js",
+    "archive:create:download-urls": "./scripts/archive/create/download-urls.sh"
   },
   "browserslist": {
     "production": [
diff --git a/scripts/archive/create/download-urls.sh b/scripts/archive/create/download-urls.sh
new file mode 100755
--- /dev/null
+++ b/scripts/archive/create/download-urls.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Downloads every URL listed in urls-cache.txt (expected next to this script)
+# using up to 8 parallel wget processes.
+#
+# Files are saved under $ARCHIVE_DIR, which defaults to this script's own
+# directory when the variable is unset.
+
+script_dir=$(dirname "$0")
+
+echo 'Starting! (Note: If many of the URLs are already downloaded, it will take some time for wget to quietly check them all and find the new ones.)'
+
+# No `-n 1` on xargs: each wget receives many URLs, so it can reuse connections
+# instead of paying process-startup cost per URL. The timeout is kept short
+# (10 seconds); URLs that fail are simply left for a later re-run, which
+# --no-clobber makes cheap by skipping files that already exist.
+xargs --arg-file="$script_dir/urls-cache.txt" -P 8 \
+  wget \
+    --directory-prefix="${ARCHIVE_DIR=$script_dir}" \
+    --force-directories \
+    --no-clobber \
+    --timeout=10 \
+    --retry-connrefused \
+    --retry-on-host-error \
+    --no-cookies \
+    --compression=auto \
+    --https-only \
+    --no-verbose