Simpler, faster modeling query

I narrowed down the problem to the fact that we were joining in pet types against assets, and *then* running GROUP and DISTINCT and everything. Assets x compatible species/color pairs is a LOT of rows! Here, we instead get all the relevant body IDs first, and *then* match them against pet types—which we fetch in one batch to match body to canonical species/color. I'm also trashing the weird caching mechanism we did here, because in practice it doesn't seem reliable anyway. If anything, I'd want to look at stronger CDN caching. (I made a small improvement to the caching annotation, but ultimately it still doesn't matter, because this query uses logged-in stuff and always comes out max-age=0 anyway.)
2021-03-18 12:57:29 -07:00 · 2021-03-18 12:57:29 -07:00 · 0e8e50b054
commit 0e8e50b054
parent 1f103989a3
3 changed files with 53 additions and 107 deletions
--- a/scripts/setup-mysql.sql
+++ b/scripts/setup-mysql.sql
@ -35,72 +35,3 @@ GRANT SELECT, UPDATE ON openneo_id.users TO impress2020;

 -- mysqldump
 GRANT LOCK TABLES ON * TO impress2020;
-
-- Procedures used in the application
-
-DELIMITER $$
-
-DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsV2$$
-CREATE PROCEDURE GetItemsThatNeedModelsV2()
-BEGIN
-  SELECT pet_types.color_id AS color_id, items.id AS item_id,
-    GROUP_CONCAT(DISTINCT pet_types.species_id ORDER BY pet_types.species_id)
-      AS modeled_species_ids,
-    -- Vandagyre was added on 2014-11-14, so we add some buffer here.
-    -- TODO: Some later Dyeworks items don't support Vandagyre.
-    -- Add a manual db flag?
-    items.created_at >= "2014-12-01" AS supports_vandagyre
-  FROM items
-  INNER JOIN parents_swf_assets psa
-    ON psa.parent_type = "Item" AND psa.parent_id = items.id
-  INNER JOIN swf_assets
-    ON swf_assets.id = psa.swf_asset_id
-  INNER JOIN pet_types
-    ON pet_types.body_id = swf_assets.body_id
-  WHERE
-    pet_types.color_id IN (
-       8, -- Blue (Standard)
-       6, -- Baby
-      44, -- Maraquan
-      46  -- Mutant
-    )
-    AND items.modeling_status_hint IS NULL
-  GROUP BY color_id, item_id
-  HAVING
-    NOT (
-      -- No species (either an All Bodies item, or a Capsule type thing)
-      count(DISTINCT pet_types.species_id) = 0
-      -- Single species (probably just their item)
-      OR count(DISTINCT pet_types.species_id) = 1
-      -- All species modeled
-      OR count(DISTINCT pet_types.species_id) = 55
-      -- All species modeled except Vandagyre, for items that don't support it
-      OR (NOT supports_vandagyre AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54")
-    )
-  ORDER BY color_id, item_id;
-END$$
-GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsV2 TO impress2020$$
-
-- This procedure is a performance optimization! We want the page to always be
-- up-to-date, but we want to avoid re-running this query if modeling data
-- hasn't changed. So, we use the last contribution timestamp as a cache hint
-- for whether the data has expired. But in this environment, sequential
-- queries are a bottleneck, so I bundled up that logic into this single
-- procedure that can run on the database! That way, it's just one network
-- round-trip instead of two, which makes a noticeable difference in our stack.
-DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsIfNotCachedV2$$
-CREATE PROCEDURE GetItemsThatNeedModelsIfNotCachedV2(
-  IN last_known_update TIMESTAMP,
-  OUT last_actual_update TIMESTAMP
-)
-BEGIN
-  SET last_actual_update =
-    (SELECT created_at FROM contributions ORDER BY id DESC LIMIT 1);
-
-  IF last_known_update < last_actual_update THEN
-    CALL GetItemsThatNeedModelsV2();
-  END IF;
-END$$
-GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsIfNotCachedV2 TO impress2020$$
-
-DELIMITER ;
--- a/src/server/loaders.js
+++ b/src/server/loaders.js
@ -410,8 +410,6 @@ const buildNewestItemsLoader = (db, loaders) =>
    return [entities];
  });

-let lastKnownUpdate = "1970-01-01"; // start it out very old!
-let lastResult = new Map();
 const buildItemsThatNeedModelsLoader = (db) =>
  new DataLoader(async (keys) => {
    // Essentially, I want to take easy advantage of DataLoader's caching, for
@ -421,48 +419,65 @@ const buildItemsThatNeedModelsLoader = (db) =>
      throw new Error(`this loader can only be loaded with the key "all"`);
    }

-    // Call the query as a procedure, defined in `setup-mysql.sql`. It will
-    // only run the query if modeling data has been changed since the timestamp
-    // we provide; otherwise, it skips the query and returns no rows, which is
-    // much faster! (The query takes a few seconds to run.)
-    //
-    // NOTE: This query has the colors hardcoded, we always fetch all of them!
-    //       And then we look up the specific colors
-    const [results, _] = await db.query(
+    const [rows, _] = await db.query(
+      `
+        SELECT T_ITEMS.item_id,
+          T_BODIES.color_id,
+          T_ITEMS.supports_vandagyre,
+          COUNT(*) AS modeled_species_count,
+          GROUP_CONCAT(
+            T_BODIES.species_id
+            ORDER BY T_BODIES.species_id
+          ) AS modeled_species_ids
+        FROM (
+          -- NOTE: I found that extracting this as a separate query that runs
+          --       first made things WAAAY faster. Less to join/group, I guess?
+          SELECT DISTINCT items.id AS item_id,
+            swf_assets.body_id AS body_id,
+            -- Vandagyre was added on 2014-11-14, so we add some buffer here.
+            -- TODO: Some later Dyeworks items don't support Vandagyre.
+            -- Add a manual db flag?
+            items.created_at >= "2014-12-01" AS supports_vandagyre
+          FROM items
+          INNER JOIN parents_swf_assets psa ON psa.parent_type = "Item"
+            AND psa.parent_id = items.id
+          INNER JOIN swf_assets ON swf_assets.id = psa.swf_asset_id
+          INNER JOIN item_translations it ON it.item_id = items.id AND it.locale = "en"
+          WHERE items.modeling_status_hint IS NULL AND it.name NOT LIKE "%MME%"
+          ORDER BY item_id
+        ) T_ITEMS
+        INNER JOIN (
+          SELECT DISTINCT body_id, species_id, color_id
+          FROM pet_types
+          WHERE color_id IN (6, 8, 44, 46)
+          ORDER BY body_id, species_id
+        ) T_BODIES ON T_ITEMS.body_id = T_BODIES.body_id
+        GROUP BY T_ITEMS.item_id, T_BODIES.color_id
+        HAVING NOT (
+          -- No species (either an All Bodies item, or a Capsule type thing)
+          modeled_species_count = 0
+          -- Single species (probably just their item)
+          OR modeled_species_count = 1
+          -- All species modeled
+          OR modeled_species_count = 55
+          -- All species modeled except Vandagyre, for items that don't support it
+          OR (NOT T_ITEMS.supports_vandagyre AND modeled_species_count = 54 AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54")
+        )
+        ORDER BY T_ITEMS.item_id;
      `
-        CALL GetItemsThatNeedModelsIfNotCachedV2(?, @LastActualUpdate);
-        SELECT @LastActualUpdate;
-      `,
-      [lastKnownUpdate]
    );

-    // The query will return 2 or 3 results.
-    // Result 1 (optional): The rows produced by the CALL, if it ran the query.
-    //                      Or, if it skipped the query, this is omitted.
-    // Result 2 (required): The MySQL summary of the effects of the CALL.
-    // Result 3 (required): The 1-row table contianing @LastActualUpdate.
-    //
-    // So, check the number of results. If it's 3, then we should update our
-    // cache. Or, if it's 2, then there was no change and we can continue with
-    // the existing cached value.
-    if (results.length === 3) {
-      const [rawRows, __, varRows] = results;
-      const rows = rawRows.map(normalizeRow);
+    const entities = rows.map(normalizeRow);

-      lastKnownUpdate = varRows[0]["@LastActualUpdate"];
-
-      // We build lastResult into a Map up-front, to speed up the many lookups
-      // that the GQL resolvers will do as we group this data into GQL nodes!
-      lastResult = new Map();
-      for (const { colorId, itemId, ...row } of rows) {
-        if (!lastResult.has(colorId)) {
-          lastResult.set(colorId, new Map());
-        }
-        lastResult.get(colorId).set(itemId, row);
+    const result = new Map();
+    for (const { colorId, itemId, ...entity } of entities) {
+      if (!result.has(colorId)) {
+        result.set(colorId, new Map());
      }
+      result.get(colorId).set(itemId, entity);
    }

-    return [lastResult];
+    return [result];
  });

 const buildItemBodiesWithAppearanceDataLoader = (db) =>
--- a/src/server/types/Item.js
+++ b/src/server/types/Item.js
@ -192,7 +192,7 @@ const typeDefs = gql`
    # NOTE: Most color IDs won't be accepted here. Either pass the ID of a
    #       major special color like Baby (#6), or leave it blank for standard
    #       bodies like Blue, Green, Red, etc.
-    itemsThatNeedModels(colorId: ID): [Item!]!
+    itemsThatNeedModels(colorId: ID): [Item!]! @cacheControl(maxAge: 1, staleWhileRevalidate: ${oneHour})
  }

  extend type Mutation {