impress-2020/scripts/archive/prepare/full.js

/**
 * archive:prepare:full generates a manifest-full.txt file, containing all of
 * the images.neopets.com URLs for customization that Dress to Impress is aware
 * of. This will enable us to back them all up in an archive!
 *
 * This script doesn't make any external network requests. Instead, it only
 * queries the DTI database, and extracts URLs from the data we already have.
 *
 * While this helps us be very fast, note that this depends on DTI's cached
 * manifests in the database being relatively up-to-date. It might be wise to
 * run the `cache-asset-manifests` script with the `--resync-existing` flag to
 * update DTI's manifest cache before running this, if you want to ensure it's
 * fully in sync!
 *
 * NOTE: This file will be in the hundreds of megabytes large! It's designed to
 *       be an easy cache to write and read, and the trade-off for that is
 *       being big and uncompressed. Delete it when you're done to save space!
 *
 * Asset types:
 *   - Item thumbnails
 *   - Asset SWFs
 *   - Asset HTML5 manifests
 *   - Asset HTML5 SVG images
 *   - Asset HTML5 movie files (images and scripts)
 */
const fs = require("fs/promises");
const path = require("path");
const connectToDb = require("../../../src/server/db");
const { normalizeRow } = require("../../../src/server/util");

async function main() {
  const db = await connectToDb();
  const file = await createUrlsCacheFile();

  try {
    const numItems = await loadNumItems(db);
    for (let i = 0; i < numItems; i += 1000) {
      console.info(`Loading items ${i + 1}–${i + 1000} of ${numItems}…`);
      const items = await loadItems(i, 1000, db);
      const urls = [];
      for (const item of items) {
        const thumbnailUrl = sanitizeUrl(item.thumbnailUrl, `Item ${item.id}`);
        urls.push(thumbnailUrl);
      }
      const lines = urls.map((url) => url + "\n");
      await file.write(lines.join(""), null, "utf8");
    }
    console.info(`Done writing item URLs.`);

    const numSwfAssets = await loadNumSwfAssets(db);
    for (let i = 0; i < numSwfAssets; i += 1000) {
      console.info(`Loading assets ${i + 1}–${i + 1000} of ${numSwfAssets}…`);
      const swfAssets = await loadSwfAssets(i, 1000, db);
      const urls = [];

      for (const swfAsset of swfAssets) {
        const swfUrl = sanitizeUrl(swfAsset.url, `Asset ${swfAsset.id}`);
        const html5Urls = getHTML5UrlsFromManifestContent(
          swfAsset.id,
          swfAsset.manifest
        ).map((url) => sanitizeUrl(url, `Asset ${swfAsset.id}`));

        urls.push(swfUrl);
        urls.push(...html5Urls);
      }

      const lines = urls.map((url) => url + "\n");
      await file.write(lines.join(""), null, "utf8");
    }
  } finally {
    await file.close();
  }

  console.info(`Done writing asset URLs.`);
}

async function createUrlsCacheFile() {
  const urlsCacheFilePath = path.join(__dirname, "../manifest-full.txt");
  return await fs.open(urlsCacheFilePath, "w");
}

async function loadNumItems(db) {
  const [rows] = await db.query(`SELECT count(*) FROM items`);
  return rows[0]["count(*)"];
}

async function loadItems(offset, limit, db) {
  const [
    rows,
  ] = await db.query(
    {sql: `SELECT id, thumbnail_url FROM items ORDER BY id LIMIT ? OFFSET ?;`, timeout: 20000},
    [limit, offset]
  );
  return rows.map(normalizeRow);
}

async function loadNumSwfAssets(db) {
  const [rows] = await db.query(`SELECT count(*) FROM swf_assets`);
  return rows[0]["count(*)"];
}

async function loadSwfAssets(offset, limit, db) {
  const [
    rows,
  ] = await db.query(
    {sql: `SELECT id, url, manifest FROM swf_assets ORDER BY id LIMIT ? OFFSET ?;`, timeout: 20000},
    [limit, offset]
  );
  return rows.map(normalizeRow);
}

const ASSET_DIRNAME_PATTERN = /^cp\/(bio|items)\/data\/[0-9]{3}\/[0-9]{3}\/[0-9]{3}\/[0-9]+(?:_[0-9a-f]{10})?/;
const PUBLIC_HTML_PATTERN = /^\/home\/images\/public_html\//;

/**
 * getHTML5UrlsFromManifestContent takes the manifest content cached in DTI's
 * database, and extracts the URLs it references, including the URL for the
 * manifest itself.
 *
 * NOTE: Another approach we could take here would be to, like DTI itself
 *       does, fetch both manifest locations and see which one answers. The
 *       clear downside is that this would be a lot of network effort and slow
 *       down this script by a ton! There's a case to be made that it's a more
 *       robust idea, though, in case DTI's manifest cache is out of date. That
 *       might be worth checking! That said, the mitigation described at the
 *       top of the script (using `cache-asset-manifests`) is probably enough.
 */
function getHTML5UrlsFromManifestContent(assetId, manifestContent) {
  if (!manifestContent) {
    return [];
  }

  let manifestData;
  try {
    manifestData = JSON.parse(manifestContent);
  } catch (error) {
    console.warn(
      `[Asset ${assetId}] Could not parse manifest as JSON; ` +
        `skipping movie URLs: ${error.message}`
    );
    return [];
  }

  let paths;
  try {
    // NOTE: The format we store in the database is a bit different than the
    //       manifest.json's original format. We remove the cpmanifest wrapper
    //       object, we camelCase the keys, and we rename `url` to `path` for
    //       increased clarity about what it is.
    const assets = manifestData.assets;
    const assetDatas = assets.map((a) => a.assetData).flat();
    paths = assetDatas.map((ad) => ad.path);
  } catch (error) {
    console.warn(
      `[Asset ${assetId}] Manifest data was not of the expected structure; ` +
        `skipping movie URLs: ${error.message}.`
    );
    return [];
  }

  if (paths.length === 0) {
    console.warn(`[Asset ${assetId}] Manifest contained no URLs; skipping.`);
    return [];
  }

  // Starting with asset #539903, when manifests reference themselves, they
  // prepend `/home/images/public_html/` to the path, oops! Strip those out.
  paths = paths.map((p) => p.replace(PUBLIC_HTML_PATTERN, ""));

  // Some manifest files reference themselves, but others don't. If this one
  // doesn't, we can infer it from the other paths: it should be at
  // `manifest.json` in the same directory.
  //
  // We can't just infer it from e.g. the SWF URL, because there are two
  // locations where a manifest file can be. As an example, for the following
  // SWF URL:
  // - https://images.neopets.com/cp/items/swf/000/000/005/5735_a8feda8d08.swf
  // ...the manifest might be either of the following:
  // - https://images.neopets.com/cp/items/data/000/000/005/5735_a8feda8d08/manifest.json
  // - https://images.neopets.com/cp/items/data/000/000/005/5735/manifest.json
  // (incidentally, it's the first one in this case!)
  if (!paths.some((p) => p.split("?")[0].endsWith("manifest.json"))) {
    try {
      paths.push(inferManifestPath(paths));
    } catch (error) {
      console.warn(
        `[Asset ${assetId}] Skipping manifest URL: ${error.message}`
      );
    }
  }

  const urls = paths.map((path) =>
    new URL(path, "https://images.neopets.com/").toString()
  );

  // Sometimes the manifest contains some duplicate URLs, so remove those.
  const uniqueUrls = [...new Set(urls)];

  return uniqueUrls;
}

function inferManifestPath(paths) {
  // Sometimes there are subdirectories, so we can't *just* look at the
  // dirname of the files. Instead, we observe that assets generally go in a
  // folder like: `cp/<bio|object>/data/123/456/789/123456789[_abcdef1234]`,
  // so we look for that prefix in the first path, check all the other paths
  // against it for good measure, then assume that `manifest.json` will be in
  // there too.
  const baseDirectoryMatch = paths[0].match(ASSET_DIRNAME_PATTERN);
  if (baseDirectoryMatch == null) {
    throw new Error(
      `Could not infer manifest path: the paths in the ` +
        `manifest don't match the expected format. (The first path was: ` +
        `${JSON.stringify(paths[0])})`
    );
  }

  const baseDirectory = baseDirectoryMatch[0];
  if (!paths.every((p) => p.startsWith(baseDirectory))) {
    throw new Error(
      `Could not infer manifest path: the paths in the manifest don't all ` +
        `belong to the same base directory.`
    );
  }

  return `${baseDirectory}/manifest.json`;
}

/**
 * Ensure this is an https://images.neopets.com URL. If the protocol isn't
 * HTTPS, we fix it. If it's only a path, we'll fix that too. If the host isn't
 * images.neopets.com, or we can't parse it at all… well, there's not a lot we
 * can do, so we leave it intact but warn about the issue.
 */
function sanitizeUrl(url, contextString) {
  // Some of the URLs in our database are written with the shorthand `//`
  // prefix that directs the browser to use either HTTPS or HTTP depending on
  // how the page was loaded. This won't parse correctly, so we fix it first!
  if (url.startsWith("//")) {
    url = "https:" + url;
  }

  let parsedUrl;
  try {
    // This is where we fix path-only "URLs": by parsing them in the context of
    // the correct origin!
    parsedUrl = new URL(url, "https://images.neopets.com");
  } catch (error) {
    console.warn(
      `[${contextString}]: URL is not parseable, but we're saving it ` +
        `anyway: ${JSON.stringify(url)}`
    );
    return url;
  }

  parsedUrl.protocol = "https:";

  if (parsedUrl.host !== "images.neopets.com") {
    console.warn(
      `[${contextString}]: URL is not from images.neopets.com, but we're ` +
        `saving it anyway: ${JSON.stringify(url)}`
    );
  }

  return parsedUrl.toString();
}

main()
  .catch((e) => {
    console.error(e);
    process.exit(1);
  })
  .then((code = 0) => process.exit(code));
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								/**
-												Refactor archive scripts to prepare/create/upload

Sat down and thought about the structure here and how to make the full/delta stuff make more sense together! Here's what I came up with!

In both full and delta archiving, we prepare the manifest, we create the local archive, then we upload it to remote.

											
										
										
											2022-10-13 16:07:12 -07:00
+								 * archive:prepare:full generates a manifest-full.txt file, containing all of
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								 * the images.neopets.com URLs for customization that Dress to Impress is aware
 								 * of. This will enable us to back them all up in an archive!
 								 *
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								 * This script doesn't make any external network requests. Instead, it only
 								 * queries the DTI database, and extracts URLs from the data we already have.
 								 *
 								 * While this helps us be very fast, note that this depends on DTI's cached
 								 * manifests in the database being relatively up-to-date. It might be wise to
 								 * run the `cache-asset-manifests` script with the `--resync-existing` flag to
 								 * update DTI's manifest cache before running this, if you want to ensure it's
 								 * fully in sync!
 								 *
 								 * NOTE: This file will be in the hundreds of megabytes large! It's designed to
 								 *       be an easy cache to write and read, and the trade-off for that is
 								 *       being big and uncompressed. Delete it when you're done to save space!
 								 *
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								 * Asset types:
 								 *   - Item thumbnails
 								 *   - Asset SWFs
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								 *   - Asset HTML5 manifests
 								 *   - Asset HTML5 SVG images
 								 *   - Asset HTML5 movie files (images and scripts)
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								 */
 								const fs = require("fs/promises");
 								const path = require("path");
 								const connectToDb = require("../../../src/server/db");
 								const { normalizeRow } = require("../../../src/server/util");
 								async function main() {
 								  const db = await connectToDb();
 								  const file = await createUrlsCacheFile();
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								  try {
 								    const numItems = await loadNumItems(db);
 								    for (let i = 0; i < numItems; i += 1000) {
 								      console.info(`Loading items ${i + 1}–${i + 1000} of ${numItems}…`);
 								      const items = await loadItems(i, 1000, db);
-												Sanitize URLs saved by archive:create:list-urls

Especially in our item thumbnails, there's a lot of messiness about what the URL protocol is. There are also some SWF assets whose "URLs" are just saved as paths.

In this change, we start processing all our outputted URLs through a `sanitizeUrl` function, which tries to massage it into an `https://images.neopets.com` URL, and warns if it cannot.

This also warns on some intentionally-different URLs, like our April Fools prank item lol

Anyway, I love functions like this, because the warnings always help me discover the data problems! I wasn't aware of the path-only SWF URLs, for example, until this script started warning about the URL parse errors!

											
										
										
											2022-09-12 20:52:45 -07:00
+								      const urls = [];
 								      for (const item of items) {
 								        const thumbnailUrl = sanitizeUrl(item.thumbnailUrl, `Item ${item.id}`);
 								        urls.push(thumbnailUrl);
 								      }
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								      const lines = urls.map((url) => url + "\n");
 								      await file.write(lines.join(""), null, "utf8");
 								    }
 								    console.info(`Done writing item URLs.`);
 								    const numSwfAssets = await loadNumSwfAssets(db);
 								    for (let i = 0; i < numSwfAssets; i += 1000) {
 								      console.info(`Loading assets ${i + 1}–${i + 1000} of ${numSwfAssets}…`);
 								      const swfAssets = await loadSwfAssets(i, 1000, db);
 								      const urls = [];
-												Sanitize URLs saved by archive:create:list-urls

Especially in our item thumbnails, there's a lot of messiness about what the URL protocol is. There are also some SWF assets whose "URLs" are just saved as paths.

In this change, we start processing all our outputted URLs through a `sanitizeUrl` function, which tries to massage it into an `https://images.neopets.com` URL, and warns if it cannot.

This also warns on some intentionally-different URLs, like our April Fools prank item lol

Anyway, I love functions like this, because the warnings always help me discover the data problems! I wasn't aware of the path-only SWF URLs, for example, until this script started warning about the URL parse errors!

											
										
										
											2022-09-12 20:52:45 -07:00
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								      for (const swfAsset of swfAssets) {
-												Sanitize URLs saved by archive:create:list-urls

Especially in our item thumbnails, there's a lot of messiness about what the URL protocol is. There are also some SWF assets whose "URLs" are just saved as paths.

In this change, we start processing all our outputted URLs through a `sanitizeUrl` function, which tries to massage it into an `https://images.neopets.com` URL, and warns if it cannot.

This also warns on some intentionally-different URLs, like our April Fools prank item lol

Anyway, I love functions like this, because the warnings always help me discover the data problems! I wasn't aware of the path-only SWF URLs, for example, until this script started warning about the URL parse errors!

											
										
										
											2022-09-12 20:52:45 -07:00
+								        const swfUrl = sanitizeUrl(swfAsset.url, `Asset ${swfAsset.id}`);
 								        const html5Urls = getHTML5UrlsFromManifestContent(
 								          swfAsset.id,
 								          swfAsset.manifest
 								        ).map((url) => sanitizeUrl(url, `Asset ${swfAsset.id}`));
 								        urls.push(swfUrl);
 								        urls.push(...html5Urls);
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								      }
-												Sanitize URLs saved by archive:create:list-urls

Especially in our item thumbnails, there's a lot of messiness about what the URL protocol is. There are also some SWF assets whose "URLs" are just saved as paths.

In this change, we start processing all our outputted URLs through a `sanitizeUrl` function, which tries to massage it into an `https://images.neopets.com` URL, and warns if it cannot.

This also warns on some intentionally-different URLs, like our April Fools prank item lol

Anyway, I love functions like this, because the warnings always help me discover the data problems! I wasn't aware of the path-only SWF URLs, for example, until this script started warning about the URL parse errors!

											
										
										
											2022-09-12 20:52:45 -07:00
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								      const lines = urls.map((url) => url + "\n");
 								      await file.write(lines.join(""), null, "utf8");
 								    }
 								  } finally {
 								    await file.close();
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								  }
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
 								  console.info(`Done writing asset URLs.`);
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								}
 								async function createUrlsCacheFile() {
-												Refactor archive scripts to prepare/create/upload

Sat down and thought about the structure here and how to make the full/delta stuff make more sense together! Here's what I came up with!

In both full and delta archiving, we prepare the manifest, we create the local archive, then we upload it to remote.

											
										
										
											2022-10-13 16:07:12 -07:00
+								  const urlsCacheFilePath = path.join(__dirname, "../manifest-full.txt");
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								  return await fs.open(urlsCacheFilePath, "w");
 								}
 								async function loadNumItems(db) {
 								  const [rows] = await db.query(`SELECT count(*) FROM items`);
 								  return rows[0]["count(*)"];
 								}
 								async function loadItems(offset, limit, db) {
 								  const [
 								    rows,
 								  ] = await db.query(
-												Increase timeout in archive SQL queries

One problem I run into with the archive task is that sometimes the queries time out? My hunch is that maybe some of the assets have like, weirdly big manifest files that are being transferred as surprisingly big text files?

Anyway, I'm increasing the timeout to 20s, which is big, but big feels good for a script that doesn't run often and where failing is not great news!

I'm also idly considering whether I wanna finally put in the work to do a bulk S3 uploader sometime, because the current version that iterates over multiple `aws s3 cp` calls is just real slow, I think because it establishes a new connection each time, and that operation is maybe surprisingly expensive? And the CLI doesn't have a way to do multiple uploads any more precisely than "sync up this whole folder", which is slow when the folder contains a lot of stuff you _know_ you don't want to head up there.

											
										
										
											2023-06-12 10:56:40 -07:00
+								    {sql: `SELECT id, thumbnail_url FROM items ORDER BY id LIMIT ? OFFSET ?;`, timeout: 20000},
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								    [limit, offset]
 								  );
 								  return rows.map(normalizeRow);
 								}
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								async function loadNumSwfAssets(db) {
 								  const [rows] = await db.query(`SELECT count(*) FROM swf_assets`);
 								  return rows[0]["count(*)"];
 								}
 								async function loadSwfAssets(offset, limit, db) {
 								  const [
 								    rows,
 								  ] = await db.query(
-												Increase timeout in archive SQL queries

One problem I run into with the archive task is that sometimes the queries time out? My hunch is that maybe some of the assets have like, weirdly big manifest files that are being transferred as surprisingly big text files?

Anyway, I'm increasing the timeout to 20s, which is big, but big feels good for a script that doesn't run often and where failing is not great news!

I'm also idly considering whether I wanna finally put in the work to do a bulk S3 uploader sometime, because the current version that iterates over multiple `aws s3 cp` calls is just real slow, I think because it establishes a new connection each time, and that operation is maybe surprisingly expensive? And the CLI doesn't have a way to do multiple uploads any more precisely than "sync up this whole folder", which is slow when the folder contains a lot of stuff you _know_ you don't want to head up there.

											
										
										
											2023-06-12 10:56:40 -07:00
+								    {sql: `SELECT id, url, manifest FROM swf_assets ORDER BY id LIMIT ? OFFSET ?;`, timeout: 20000},
-												Add asset URLs to archive:create:list-urls script

Here, we read URLs out from the swf_assets table, including SWFs, manfests, and everything referenced by the manifests.

There are a few data-polishing tricks we needed to do to get this to work! Most notably, newer manfests reference themselves, but older ones don't; so we try to infer the manifest URL from the other URLs. (Our database caches the manifest content, but not the manifest URL it came from.)

											
										
										
											2022-09-12 17:26:11 -07:00
+								    [limit, offset]
 								  );
 								  return rows.map(normalizeRow);
 								}
 								const ASSET_DIRNAME_PATTERN = /^cp\/(bio|items)\/data\/[0-9]{3}\/[0-9]{3}\/[0-9]{3}\/[0-9]+(?:_[0-9a-f]{10})?/;
 								const PUBLIC_HTML_PATTERN = /^\/home\/images\/public_html\//;
 								/**
 								 * getHTML5UrlsFromManifestContent takes the manifest content cached in DTI's
 								 * database, and extracts the URLs it references, including the URL for the
 								 * manifest itself.
 								 *
 								 * NOTE: Another approach we could take here would be to, like DTI itself
 								 *       does, fetch both manifest locations and see which one answers. The
 								 *       clear downside is that this would be a lot of network effort and slow
 								 *       down this script by a ton! There's a case to be made that it's a more
 								 *       robust idea, though, in case DTI's manifest cache is out of date. That
 								 *       might be worth checking! That said, the mitigation described at the
 								 *       top of the script (using `cache-asset-manifests`) is probably enough.
 								 */
 								function getHTML5UrlsFromManifestContent(assetId, manifestContent) {
 								  if (!manifestContent) {
 								    return [];
 								  }
 								  let manifestData;
 								  try {
 								    manifestData = JSON.parse(manifestContent);
 								  } catch (error) {
 								    console.warn(
 								      `[Asset ${assetId}] Could not parse manifest as JSON; ` +
 								        `skipping movie URLs: ${error.message}`
 								    );
 								    return [];
 								  }
 								  let paths;
 								  try {
 								    // NOTE: The format we store in the database is a bit different than the
 								    //       manifest.json's original format. We remove the cpmanifest wrapper
 								    //       object, we camelCase the keys, and we rename `url` to `path` for
 								    //       increased clarity about what it is.
 								    const assets = manifestData.assets;
 								    const assetDatas = assets.map((a) => a.assetData).flat();
 								    paths = assetDatas.map((ad) => ad.path);
 								  } catch (error) {
 								    console.warn(
 								      `[Asset ${assetId}] Manifest data was not of the expected structure; ` +
 								        `skipping movie URLs: ${error.message}.`
 								    );
 								    return [];
 								  }
 								  if (paths.length === 0) {
 								    console.warn(`[Asset ${assetId}] Manifest contained no URLs; skipping.`);
 								    return [];
 								  }
 								  // Starting with asset #539903, when manifests reference themselves, they
 								  // prepend `/home/images/public_html/` to the path, oops! Strip those out.
 								  paths = paths.map((p) => p.replace(PUBLIC_HTML_PATTERN, ""));
 								  // Some manifest files reference themselves, but others don't. If this one
 								  // doesn't, we can infer it from the other paths: it should be at
 								  // `manifest.json` in the same directory.
 								  //
 								  // We can't just infer it from e.g. the SWF URL, because there are two
 								  // locations where a manifest file can be. As an example, for the following
 								  // SWF URL:
 								  // - https://images.neopets.com/cp/items/swf/000/000/005/5735_a8feda8d08.swf
 								  // ...the manifest might be either of the following:
 								  // - https://images.neopets.com/cp/items/data/000/000/005/5735_a8feda8d08/manifest.json
 								  // - https://images.neopets.com/cp/items/data/000/000/005/5735/manifest.json
 								  // (incidentally, it's the first one in this case!)
 								  if (!paths.some((p) => p.split("?")[0].endsWith("manifest.json"))) {
 								    try {
 								      paths.push(inferManifestPath(paths));
 								    } catch (error) {
 								      console.warn(
 								        `[Asset ${assetId}] Skipping manifest URL: ${error.message}`
 								      );
 								    }
 								  }
 								  const urls = paths.map((path) =>
 								    new URL(path, "https://images.neopets.com/").toString()
 								  );
 								  // Sometimes the manifest contains some duplicate URLs, so remove those.
 								  const uniqueUrls = [...new Set(urls)];
 								  return uniqueUrls;
 								}
 								function inferManifestPath(paths) {
 								  // Sometimes there are subdirectories, so we can't *just* look at the
 								  // dirname of the files. Instead, we observe that assets generally go in a
 								  // folder like: `cp/<bio|object>/data/123/456/789/123456789[_abcdef1234]`,
 								  // so we look for that prefix in the first path, check all the other paths
 								  // against it for good measure, then assume that `manifest.json` will be in
 								  // there too.
 								  const baseDirectoryMatch = paths[0].match(ASSET_DIRNAME_PATTERN);
 								  if (baseDirectoryMatch == null) {
 								    throw new Error(
 								      `Could not infer manifest path: the paths in the ` +
 								        `manifest don't match the expected format. (The first path was: ` +
 								        `${JSON.stringify(paths[0])})`
 								    );
 								  }
 								  const baseDirectory = baseDirectoryMatch[0];
 								  if (!paths.every((p) => p.startsWith(baseDirectory))) {
 								    throw new Error(
 								      `Could not infer manifest path: the paths in the manifest don't all ` +
 								        `belong to the same base directory.`
 								    );
 								  }
 								  return `${baseDirectory}/manifest.json`;
 								}
-												Sanitize URLs saved by archive:create:list-urls

Especially in our item thumbnails, there's a lot of messiness about what the URL protocol is. There are also some SWF assets whose "URLs" are just saved as paths.

In this change, we start processing all our outputted URLs through a `sanitizeUrl` function, which tries to massage it into an `https://images.neopets.com` URL, and warns if it cannot.

This also warns on some intentionally-different URLs, like our April Fools prank item lol

Anyway, I love functions like this, because the warnings always help me discover the data problems! I wasn't aware of the path-only SWF URLs, for example, until this script started warning about the URL parse errors!

											
										
										
											2022-09-12 20:52:45 -07:00
+								/**
 								 * Ensure this is an https://images.neopets.com URL. If the protocol isn't
 								 * HTTPS, we fix it. If it's only a path, we'll fix that too. If the host isn't
 								 * images.neopets.com, or we can't parse it at all… well, there's not a lot we
 								 * can do, so we leave it intact but warn about the issue.
 								 */
 								function sanitizeUrl(url, contextString) {
 								  // Some of the URLs in our database are written with the shorthand `//`
 								  // prefix that directs the browser to use either HTTPS or HTTP depending on
 								  // how the page was loaded. This won't parse correctly, so we fix it first!
 								  if (url.startsWith("//")) {
 								    url = "https:" + url;
 								  }
 								  let parsedUrl;
 								  try {
 								    // This is where we fix path-only "URLs": by parsing them in the context of
 								    // the correct origin!
 								    parsedUrl = new URL(url, "https://images.neopets.com");
 								  } catch (error) {
 								    console.warn(
 								      `[${contextString}]: URL is not parseable, but we're saving it ` +
 								        `anyway: ${JSON.stringify(url)}`
 								    );
 								    return url;
 								  }
 								  parsedUrl.protocol = "https:";
 								  if (parsedUrl.host !== "images.neopets.com") {
 								    console.warn(
 								      `[${contextString}]: URL is not from images.neopets.com, but we're ` +
 								        `saving it anyway: ${JSON.stringify(url)}`
 								    );
 								  }
 								  return parsedUrl.toString();
 								}
-												Start building archive:create:list-urls script

Just working on making an images.neopets.com mirror, just in case! To start, I'm extracting all the URLs we need to back up; and then I'll make a separate script whose job is to mirror all of the URLs in the list.

											
										
										
											2022-09-12 15:53:22 -07:00
+								main()
 								  .catch((e) => {
 								    console.error(e);
 								    process.exit(1);
 								  })
 								  .then((code = 0) => process.exit(code));