Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests. There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)
2022-09-12 17:26:11 -07:00 · 2022-09-12 17:26:11 -07:00 · ef9958c11e
commit ef9958c11e
parent 3ce895d52f
1 changed files with 177 additions and 11 deletions
--- a/scripts/archive/create/list-urls.js
+++ b/scripts/archive/create/list-urls.js
@ -3,12 +3,25 @@
 * the images.neopets.com URLs for customization that Dress to Impress is aware
 * of. This will enable us to back them all up in an archive!
 *
+ * This script doesn't make any external network requests. Instead, it only
+ * queries the DTI database, and extracts URLs from the data we already have.
+ *
+ * While this helps us be very fast, note that this depends on DTI's cached
+ * manifests in the database being relatively up-to-date. It might be wise to
+ * run the `cache-asset-manifests` script with the `--resync-existing` flag to
+ * update DTI's manifest cache before running this, if you want to ensure it's
+ * fully in sync!
+ *
+ * NOTE: This file will be in the hundreds of megabytes large! It's designed to
+ *       be an easy cache to write and read, and the trade-off for that is
+ *       being big and uncompressed. Delete it when you're done to save space!
+ *
 * Asset types:
 *   - Item thumbnails
 *   - Asset SWFs
- *   - Asset movie manifests
- *   - Asset movie images
- *   - Asset movie scripts
+ *   - Asset HTML5 manifests
+ *   - Asset HTML5 SVG images
+ *   - Asset HTML5 movie files (images and scripts)
 */
 const fs = require("fs/promises");
 const path = require("path");
@ -28,15 +41,36 @@ async function main() {
  const db = await connectToDb();
  const file = await createUrlsCacheFile();

-  const numItems = await loadNumItems(db);
-  for (let i = 0; i < numItems; i += 1000) {
-    console.info(`Loading items ${i + 1}–${i + 1000} of ${numItems}…`);
-    const items = await loadItems(i, 1000, db);
-    const thumbnailUrls = items.map((i) => i.thumbnailUrl);
-    const lines = thumbnailUrls.map((url) => url + "\n");
-    await file.write(lines.join(""), null, "utf8");
+  try {
+    const numItems = await loadNumItems(db);
+    for (let i = 0; i < numItems; i += 1000) {
+      console.info(`Loading items ${i + 1}–${i + 1000} of ${numItems}…`);
+      const items = await loadItems(i, 1000, db);
+      const urls = items.map((i) => i.thumbnailUrl);
+      const lines = urls.map((url) => url + "\n");
+      await file.write(lines.join(""), null, "utf8");
+    }
+    console.info(`Done writing item URLs.`);
+
+    const numSwfAssets = await loadNumSwfAssets(db);
+    for (let i = 0; i < numSwfAssets; i += 1000) {
+      console.info(`Loading assets ${i + 1}–${i + 1000} of ${numSwfAssets}…`);
+      const swfAssets = await loadSwfAssets(i, 1000, db);
+      const urls = [];
+      for (const swfAsset of swfAssets) {
+        urls.push(swfAsset.url);
+        urls.push(
+          ...getHTML5UrlsFromManifestContent(swfAsset.id, swfAsset.manifest)
+        );
+      }
+      const lines = urls.map((url) => url + "\n");
+      await file.write(lines.join(""), null, "utf8");
+    }
+  } finally {
+    await file.close();
  }
-  console.info(`Done writing item URLs.`);
+
+  console.info(`Done writing asset URLs.`);
 }

 async function checkIfUrlsCacheFileAlreadyExists() {
@ -69,6 +103,138 @@ async function loadItems(offset, limit, db) {
  return rows.map(normalizeRow);
 }

+async function loadNumSwfAssets(db) {
+  const [rows] = await db.query(`SELECT count(*) FROM swf_assets`);
+  return rows[0]["count(*)"];
+}
+
+async function loadSwfAssets(offset, limit, db) {
+  const [
+    rows,
+  ] = await db.query(
+    `SELECT id, url, manifest FROM swf_assets ORDER BY id LIMIT ? OFFSET ?;`,
+    [limit, offset]
+  );
+  return rows.map(normalizeRow);
+}
+
+const ASSET_DIRNAME_PATTERN = /^cp\/(bio|items)\/data\/[0-9]{3}\/[0-9]{3}\/[0-9]{3}\/[0-9]+(?:_[0-9a-f]{10})?/;
+const PUBLIC_HTML_PATTERN = /^\/home\/images\/public_html\//;
+
+/**
+ * getHTML5UrlsFromManifestContent takes the manifest content cached in DTI's
+ * database, and extracts the URLs it references, including the URL for the
+ * manifest itself.
+ *
+ * NOTE: Another approach we could take here would be to, like DTI itself
+ *       does, fetch both manifest locations and see which one answers. The
+ *       clear downside is that this would be a lot of network effort and slow
+ *       down this script by a ton! There's a case to be made that it's a more
+ *       robust idea, though, in case DTI's manifest cache is out of date. That
+ *       might be worth checking! That said, the mitigation described at the
+ *       top of the script (using `cache-asset-manifests`) is probably enough.
+ */
+function getHTML5UrlsFromManifestContent(assetId, manifestContent) {
+  if (!manifestContent) {
+    return [];
+  }
+
+  let manifestData;
+  try {
+    manifestData = JSON.parse(manifestContent);
+  } catch (error) {
+    console.warn(
+      `[Asset ${assetId}] Could not parse manifest as JSON; ` +
+        `skipping movie URLs: ${error.message}`
+    );
+    return [];
+  }
+
+  let paths;
+  try {
+    // NOTE: The format we store in the database is a bit different than the
+    //       manifest.json's original format. We remove the cpmanifest wrapper
+    //       object, we camelCase the keys, and we rename `url` to `path` for
+    //       increased clarity about what it is.
+    const assets = manifestData.assets;
+    const assetDatas = assets.map((a) => a.assetData).flat();
+    paths = assetDatas.map((ad) => ad.path);
+  } catch (error) {
+    console.warn(
+      `[Asset ${assetId}] Manifest data was not of the expected structure; ` +
+        `skipping movie URLs: ${error.message}.`
+    );
+    return [];
+  }
+
+  if (paths.length === 0) {
+    console.warn(`[Asset ${assetId}] Manifest contained no URLs; skipping.`);
+    return [];
+  }
+
+  // Starting with asset #539903, when manifests reference themselves, they
+  // prepend `/home/images/public_html/` to the path, oops! Strip those out.
+  paths = paths.map((p) => p.replace(PUBLIC_HTML_PATTERN, ""));
+
+  // Some manifest files reference themselves, but others don't. If this one
+  // doesn't, we can infer it from the other paths: it should be at
+  // `manifest.json` in the same directory.
+  //
+  // We can't just infer it from e.g. the SWF URL, because there are two
+  // locations where a manifest file can be. As an example, for the following
+  // SWF URL:
+  // - https://images.neopets.com/cp/items/swf/000/000/005/5735_a8feda8d08.swf
+  // ...the manifest might be either of the following:
+  // - https://images.neopets.com/cp/items/data/000/000/005/5735_a8feda8d08/manifest.json
+  // - https://images.neopets.com/cp/items/data/000/000/005/5735/manifest.json
+  // (incidentally, it's the first one in this case!)
+  if (!paths.some((p) => p.split("?")[0].endsWith("manifest.json"))) {
+    try {
+      paths.push(inferManifestPath(paths));
+    } catch (error) {
+      console.warn(
+        `[Asset ${assetId}] Skipping manifest URL: ${error.message}`
+      );
+    }
+  }
+
+  const urls = paths.map((path) =>
+    new URL(path, "https://images.neopets.com/").toString()
+  );
+
+  // Sometimes the manifest contains some duplicate URLs, so remove those.
+  const uniqueUrls = [...new Set(urls)];
+
+  return uniqueUrls;
+}
+
+function inferManifestPath(paths) {
+  // Sometimes there are subdirectories, so we can't *just* look at the
+  // dirname of the files. Instead, we observe that assets generally go in a
+  // folder like: `cp/<bio|object>/data/123/456/789/123456789[_abcdef1234]`,
+  // so we look for that prefix in the first path, check all the other paths
+  // against it for good measure, then assume that `manifest.json` will be in
+  // there too.
+  const baseDirectoryMatch = paths[0].match(ASSET_DIRNAME_PATTERN);
+  if (baseDirectoryMatch == null) {
+    throw new Error(
+      `Could not infer manifest path: the paths in the ` +
+        `manifest don't match the expected format. (The first path was: ` +
+        `${JSON.stringify(paths[0])})`
+    );
+  }
+
+  const baseDirectory = baseDirectoryMatch[0];
+  if (!paths.every((p) => p.startsWith(baseDirectory))) {
+    throw new Error(
+      `Could not infer manifest path: the paths in the manifest don't all ` +
+        `belong to the same base directory.`
+    );
+  }
+
+  return `${baseDirectory}/manifest.json`;
+}
+
 main()
  .catch((e) => {
    console.error(e);