Sanitize URLs saved by archive:create:list-urls

Especially in our item thumbnails, there's a lot of messiness about what the URL protocol is. There are also some SWF assets whose "URLs" are just saved as paths.

In this change, we start processing all our outputted URLs through a `sanitizeUrl` function, which tries to massage each one into an `https://images.neopets.com` URL, and warns if it cannot.

This also warns on some intentionally-different URLs, like our April Fools prank item lol

Anyway, I love functions like this, because the warnings always help me discover the data problems! I wasn't aware of the path-only SWF URLs, for example, until this script started warning about the URL parse errors!
This commit is contained in:
Emi Matchu 2022-09-12 20:52:45 -07:00
parent ef9958c11e
commit ea8715cd90

View file

@ -46,7 +46,11 @@ async function main() {
for (let i = 0; i < numItems; i += 1000) {
console.info(`Loading items ${i + 1}${i + 1000} of ${numItems}`);
const items = await loadItems(i, 1000, db);
const urls = items.map((i) => i.thumbnailUrl);
const urls = [];
for (const item of items) {
const thumbnailUrl = sanitizeUrl(item.thumbnailUrl, `Item ${item.id}`);
urls.push(thumbnailUrl);
}
const lines = urls.map((url) => url + "\n");
await file.write(lines.join(""), null, "utf8");
}
@ -57,12 +61,18 @@ async function main() {
console.info(`Loading assets ${i + 1}${i + 1000} of ${numSwfAssets}`);
const swfAssets = await loadSwfAssets(i, 1000, db);
const urls = [];
for (const swfAsset of swfAssets) {
urls.push(swfAsset.url);
urls.push(
...getHTML5UrlsFromManifestContent(swfAsset.id, swfAsset.manifest)
);
const swfUrl = sanitizeUrl(swfAsset.url, `Asset ${swfAsset.id}`);
const html5Urls = getHTML5UrlsFromManifestContent(
swfAsset.id,
swfAsset.manifest
).map((url) => sanitizeUrl(url, `Asset ${swfAsset.id}`));
urls.push(swfUrl);
urls.push(...html5Urls);
}
const lines = urls.map((url) => url + "\n");
await file.write(lines.join(""), null, "utf8");
}
@ -97,7 +107,7 @@ async function loadItems(offset, limit, db) {
const [
rows,
] = await db.query(
`SELECT thumbnail_url FROM items ORDER BY id LIMIT ? OFFSET ?;`,
`SELECT id, thumbnail_url FROM items ORDER BY id LIMIT ? OFFSET ?;`,
[limit, offset]
);
return rows.map(normalizeRow);
@ -235,6 +245,45 @@ function inferManifestPath(paths) {
return `${baseDirectory}/manifest.json`;
}
/**
 * Ensure this is an https://images.neopets.com URL. If the protocol isn't
 * HTTPS, we fix it. If it's only a path, we'll fix that too. If the host isn't
 * images.neopets.com, or we can't parse it at all (including when the value
 * isn't a string, e.g. `null`)… well, there's not a lot we can do, so we leave
 * it intact but warn about the issue.
 *
 * @param {string} url - The raw URL (or path) to clean up.
 * @param {string} contextString - Label prefixed to any warning, so the
 *     offending record can be identified from the log output.
 * @returns {string} The normalized URL. When the input can't be parsed, the
 *     input is returned instead (with a leading `//` expanded to `https://`;
 *     non-string inputs are returned untouched).
 */
function sanitizeUrl(url, contextString) {
  const warnNotParseable = (value) => {
    console.warn(
      `[${contextString}]: URL is not parseable, but we're saving it ` +
        `anyway: ${JSON.stringify(value)}`
    );
  };

  // A missing value (e.g. `null`) has no `startsWith` method. Rather than
  // crash the whole run on one bad record, treat it like any other
  // unparseable URL: warn, and hand it back unchanged.
  if (typeof url !== "string") {
    warnNotParseable(url);
    return url;
  }

  // Some of the URLs in our database are written with the shorthand `//`
  // prefix that directs the browser to use either HTTPS or HTTP depending on
  // how the page was loaded. Expand it to an explicit HTTPS URL up front, so
  // that the value we log (or return on a parse failure) is unambiguous.
  const candidateUrl = url.startsWith("//") ? `https:${url}` : url;

  let parsedUrl;
  try {
    // This is where we fix path-only "URLs": by parsing them in the context of
    // the correct origin!
    parsedUrl = new URL(candidateUrl, "https://images.neopets.com");
  } catch (error) {
    warnNotParseable(candidateUrl);
    return candidateUrl;
  }

  // Upgrade e.g. `http:` URLs to HTTPS.
  parsedUrl.protocol = "https:";

  // Unexpected hosts are kept as-is, but flagged so they can be reviewed.
  if (parsedUrl.host !== "images.neopets.com") {
    console.warn(
      `[${contextString}]: URL is not from images.neopets.com, but we're ` +
        `saving it anyway: ${JSON.stringify(candidateUrl)}`
    );
  }

  return parsedUrl.toString();
}
main()
.catch((e) => {
console.error(e);