impress-2020/pages/api/readFromArchive.js
Commit 861f3ab881 by Matchu, 2022-10-13 14:02:37 -07:00: Fix bug in /api/readFromArchive
Well, two bugs: one with URL encoding, and another minor one where I forgot to return after ending the request with 404, oops lol :p


/**
* /api/readFromArchive reads a file from our images.neopets.com archive.
*
* Parameters:
* - url: The original canonical URL of the image,
* e.g. "https://images.neopets.com/items/acbg_virtupets.gif".
*
* One might wonder, why have this proxy at all? Why not just use the storage
* service's own lil HTTP server? My main reason is to have one more lil
* gate to control access, if it turns out someone tries to burn our egress
* costs and we need to shut them off! And just in general, I don't like
* relying on "public ACL" default behavior, it's scary :p
*
* For *actual* sharing of the whole archive, we'd want to like, put a .zip
* file somewhere for people to download, from a service optimized for large
* downloads.
*
* We don't check the incoming request very closely, because like, anything in
* the archive bucket *is* meant to be read, there aren't secrets in it. And
 * this endpoint doesn't have any dangerous permissions, just read access to
 * the bucket. So even if someone did something clever with a URL to trick us into
* loading the "wrong" thing, I don't see what they'd even *want*, y'know?
*/
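
// Initialize Honeycomb's beeline before anything else: it's designed to load
// ahead of the modules it instruments, which is (presumably) why this is a
// `require` above the `import`s rather than an import itself.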
const beeline = require("honeycomb-beeline")({
writeKey: process.env["HONEYCOMB_WRITE_KEY"],
dataset:
process.env["NODE_ENV"] === "production"
? "Dress to Impress (2020)"
: "Dress to Impress (2020, dev)",
serviceName: "impress-2020-gql-server",
});

import * as path from "node:path";
import { GetObjectCommand, S3Client } from "@aws-sdk/client-s3";

async function handle(req, res) {
const { url } = req.query;
let parsedUrl;
try {
parsedUrl = new URL(url);
  } catch (error) {
    res.status(400);
    res.end(`Error parsing URL parameter: ${error.message}`);
    return;
  }

// An archive key is like: "images.neopets.com/cp/data/1/2/3/atlas.png?v=123".
// So basically, just the host, path, and query string; no protocol or hash.
// (Most archive keys don't have a query string, but some do! And don't
// worry, `parsedUrl.search` will just be empty if there's no query string.)
//
  // The archive key doesn't include URL encoding (e.g. it has " " instead of
  // "%20"), but the URL object gives us encoded paths and queries for safety
  // by default, so we need to decode them when constructing the key!
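  //
  // So for example, a (hypothetical) URL like
  // "https://images.neopets.com/cp/data/1/2/3/my%20atlas.png?v=123" becomes
  // the archive key "images.neopets.com/cp/data/1/2/3/my atlas.png?v=123".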
const archiveKey =
parsedUrl.host +
decodeURIComponent(parsedUrl.pathname) +
    decodeURIComponent(parsedUrl.search);

  let archiveRes;
try {
archiveRes = await fetchFromArchive(archiveKey);
} catch (error) {
if (error.Code === "NoSuchKey") {
res.status(404);
res.end();
return;
    }
    res.status(500);
    res.end(`Error loading file from archive: ${error.message}`);
    return;
  }

  // Set the Content-Type so the browser interprets it correctly, and also tell
  // it the filename via Content-Disposition, in case the browser chooses to
  // download it immediately (like with SWFs), or the user chooses Save As
  // later.
const filename = path.basename(archiveKey.split("?")[0]);
const contentType = inferContentType(filename);
res.setHeader("Content-Type", contentType);
res.setHeader("Content-Disposition", `inline; filename=${filename}`);
// Send a long-term cache header, like images.neopets.com does! We assume
// that, if they change an asset, they'll change the query string to bust the
// cache, and so we'll get to see that change in the updated archive too.
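  // (E.g., if the hypothetical "atlas.png?v=123" changed, we'd expect it to
  // be re-published under a new URL like "atlas.png?v=124".)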
res.setHeader("Cache-Control", "public, max-age=31536000, immutable");
res.send(archiveRes.Body);
}

async function fetchFromArchive(archiveKey) {
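  // Build a client for our S3-compatible storage service. Passing a custom
  // `endpoint` is what points the AWS SDK at our archive host instead of
  // AWS's own servers.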
const storage = new S3Client({
region: process.env.ARCHIVE_STORAGE_REGION,
endpoint: `https://${process.env.ARCHIVE_STORAGE_HOST}`,
credentials: {
accessKeyId: process.env.ARCHIVE_STORAGE_READ_ACCESS_KEY,
secretAccessKey: process.env.ARCHIVE_STORAGE_READ_SECRET_KEY,
},
});
const res = await storage.send(
new GetObjectCommand({
Bucket: process.env.ARCHIVE_STORAGE_BUCKET,
Key: archiveKey,
})
);
return res;
}

function inferContentType(filename) {
// So, S3 objects *do* have a ContentType field, but our upload script
// doesn't infer it correctly for the filenames that include query strings.
// Rather than write our own uploader or something to fix the ContentType in
// S3 after the fact (the latter of which wouldn't be unreasonable and might
// be a better approach), we just take advantage of the fact that the archive
// only *really* has a small handful of filetypes that we would need to serve
// to DTI, and that serving as `application/octet-stream` isn't unreasonable
// for other kinds of files we might have.
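  //
  // So, for example, "atlas.png" maps to "image/png", and an unrecognized
  // extension like a hypothetical ".dat" falls through to
  // "application/octet-stream".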
const fileExtension = path.extname(filename);
if (fileExtension === ".json") {
return "application/json";
} else if (fileExtension === ".js") {
return "text/javascript";
} else if (fileExtension === ".png") {
return "image/png";
} else if (fileExtension === ".gif") {
return "image/gif";
} else if (fileExtension === ".svg") {
return "image/svg+xml";
} else if (fileExtension === ".jpeg" || fileExtension === ".jpg") {
return "image/jpeg";
} else if (fileExtension === ".swf") {
return "application/x-shockwave-flash";
} else if (fileExtension === ".mp3") {
return "audio/mpeg3";
} else {
return "application/octet-stream";
}
}

async function handleWithBeeline(req, res) {
  await beeline.withTrace(
{ name: "api/readFromArchive", operation_name: "api/readFromArchive" },
() => handle(req, res)
);
}

export default handleWithBeeline;