From 0e8e50b05452a95b6bc9d4a43c92cf6d138aaac6 Mon Sep 17 00:00:00 2001
From: Matchu <matchu1993@gmail.com>
Date: Thu, 18 Mar 2021 12:57:29 -0700
Subject: [PATCH] Simpler, faster modeling query
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I narrowed down the problem to the fact that we were joining in pet types against assets, and *then* running GROUP and DISTINCT and everything. Assets x compatible species/color pairs is a LOT of rows!

Here, we instead get all the relevant body IDs first, and *then* match them against pet types—which we fetch in one batch to match body to canonical species/color.

I'm also trashing the weird caching mechanism we did here, because in practice it doesn't seem reliable anyway. If anything, I'd want to look at stronger CDN caching. (I made a small improvement to the caching annotation, but ultimately it still doesn't matter, because this query uses logged-in stuff and always comes out max-age=0 anyway.)
---
 scripts/setup-mysql.sql  | 69 -------------------------------
 src/server/loaders.js    | 89 +++++++++++++++++++++++-----------------
 src/server/types/Item.js |  2 +-
 3 files changed, 53 insertions(+), 107 deletions(-)

diff --git a/scripts/setup-mysql.sql b/scripts/setup-mysql.sql
index ff19a19..8d550c9 100644
--- a/scripts/setup-mysql.sql
+++ b/scripts/setup-mysql.sql
@@ -35,72 +35,3 @@ GRANT SELECT, UPDATE ON openneo_id.users TO impress2020;
 
 -- mysqldump
 GRANT LOCK TABLES ON * TO impress2020;
-
--- Procedures used in the application
-
-DELIMITER $$
-
-DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsV2$$
-CREATE PROCEDURE GetItemsThatNeedModelsV2()
-BEGIN
-  SELECT pet_types.color_id AS color_id, items.id AS item_id,
-    GROUP_CONCAT(DISTINCT pet_types.species_id ORDER BY pet_types.species_id)
-      AS modeled_species_ids,
-    -- Vandagyre was added on 2014-11-14, so we add some buffer here.
-    -- TODO: Some later Dyeworks items don't support Vandagyre.
-    -- Add a manual db flag?
-    items.created_at >= "2014-12-01" AS supports_vandagyre
-  FROM items
-  INNER JOIN parents_swf_assets psa
-    ON psa.parent_type = "Item" AND psa.parent_id = items.id
-  INNER JOIN swf_assets
-    ON swf_assets.id = psa.swf_asset_id
-  INNER JOIN pet_types
-    ON pet_types.body_id = swf_assets.body_id
-  WHERE
-    pet_types.color_id IN (
-       8, -- Blue (Standard)
-       6, -- Baby
-      44, -- Maraquan
-      46  -- Mutant
-    )
-    AND items.modeling_status_hint IS NULL
-  GROUP BY color_id, item_id
-  HAVING
-    NOT (
-      -- No species (either an All Bodies item, or a Capsule type thing)
-      count(DISTINCT pet_types.species_id) = 0
-      -- Single species (probably just their item)
-      OR count(DISTINCT pet_types.species_id) = 1
-      -- All species modeled
-      OR count(DISTINCT pet_types.species_id) = 55
-      -- All species modeled except Vandagyre, for items that don't support it
-      OR (NOT supports_vandagyre AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54")
-    )
-  ORDER BY color_id, item_id;
-END$$
-GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsV2 TO impress2020$$
-
--- This procedure is a performance optimization! We want the page to always be
--- up-to-date, but we want to avoid re-running this query if modeling data
--- hasn't changed. So, we use the last contribution timestamp as a cache hint
--- for whether the data has expired. But in this environment, sequential
--- queries are a bottleneck, so I bundled up that logic into this single
--- procedure that can run on the database! That way, it's just one network
--- round-trip instead of two, which makes a noticeable difference in our stack.
-DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsIfNotCachedV2$$
-CREATE PROCEDURE GetItemsThatNeedModelsIfNotCachedV2(
-  IN last_known_update TIMESTAMP,
-  OUT last_actual_update TIMESTAMP
-)
-BEGIN
-  SET last_actual_update =
-    (SELECT created_at FROM contributions ORDER BY id DESC LIMIT 1);
-
-  IF last_known_update < last_actual_update THEN
-    CALL GetItemsThatNeedModelsV2();
-  END IF;
-END$$
-GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsIfNotCachedV2 TO impress2020$$
-
-DELIMITER ;
diff --git a/src/server/loaders.js b/src/server/loaders.js
index ce788dd..f017eef 100644
--- a/src/server/loaders.js
+++ b/src/server/loaders.js
@@ -410,8 +410,6 @@ const buildNewestItemsLoader = (db, loaders) =>
     return [entities];
   });
 
-let lastKnownUpdate = "1970-01-01"; // start it out very old!
-let lastResult = new Map();
 const buildItemsThatNeedModelsLoader = (db) =>
   new DataLoader(async (keys) => {
     // Essentially, I want to take easy advantage of DataLoader's caching, for
@@ -421,48 +419,65 @@ const buildItemsThatNeedModelsLoader = (db) =>
       throw new Error(`this loader can only be loaded with the key "all"`);
     }
 
-    // Call the query as a procedure, defined in `setup-mysql.sql`. It will
-    // only run the query if modeling data has been changed since the timestamp
-    // we provide; otherwise, it skips the query and returns no rows, which is
-    // much faster! (The query takes a few seconds to run.)
-    //
-    // NOTE: This query has the colors hardcoded, we always fetch all of them!
-    //       And then we look up the specific colors
-    const [results, _] = await db.query(
+    const [rows, _] = await db.query(
+      `
+        SELECT T_ITEMS.item_id,
+          T_BODIES.color_id,
+          T_ITEMS.supports_vandagyre,
+          COUNT(*) AS modeled_species_count,
+          GROUP_CONCAT(
+            T_BODIES.species_id
+            ORDER BY T_BODIES.species_id
+          ) AS modeled_species_ids
+        FROM (
+          -- NOTE: I found that extracting this as a separate query that runs
+          --       first made things WAAAY faster. Less to join/group, I guess?
+          SELECT DISTINCT items.id AS item_id,
+            swf_assets.body_id AS body_id,
+            -- Vandagyre was added on 2014-11-14, so we add some buffer here.
+            -- TODO: Some later Dyeworks items don't support Vandagyre.
+            -- Add a manual db flag?
+            items.created_at >= "2014-12-01" AS supports_vandagyre
+          FROM items
+          INNER JOIN parents_swf_assets psa ON psa.parent_type = "Item"
+            AND psa.parent_id = items.id
+          INNER JOIN swf_assets ON swf_assets.id = psa.swf_asset_id
+          INNER JOIN item_translations it ON it.item_id = items.id AND it.locale = "en"
+          WHERE items.modeling_status_hint IS NULL AND it.name NOT LIKE "%MME%"
+          ORDER BY item_id
+        ) T_ITEMS
+        INNER JOIN (
+          SELECT DISTINCT body_id, species_id, color_id
+          FROM pet_types
+          WHERE color_id IN (6, 8, 44, 46)
+          ORDER BY body_id, species_id
+        ) T_BODIES ON T_ITEMS.body_id = T_BODIES.body_id
+        GROUP BY T_ITEMS.item_id, T_BODIES.color_id
+        HAVING NOT (
+          -- No species (either an All Bodies item, or a Capsule type thing)
+          modeled_species_count = 0
+          -- Single species (probably just their item)
+          OR modeled_species_count = 1
+          -- All species modeled
+          OR modeled_species_count = 55
+          -- All species modeled except Vandagyre, for items that don't support it
+          OR (NOT T_ITEMS.supports_vandagyre AND modeled_species_count = 54 AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54")
+        )
+        ORDER BY T_ITEMS.item_id;
       `
-        CALL GetItemsThatNeedModelsIfNotCachedV2(?, @LastActualUpdate);
-        SELECT @LastActualUpdate;
-      `,
-      [lastKnownUpdate]
     );
 
-    // The query will return 2 or 3 results.
-    // Result 1 (optional): The rows produced by the CALL, if it ran the query.
-    //                      Or, if it skipped the query, this is omitted.
-    // Result 2 (required): The MySQL summary of the effects of the CALL.
-    // Result 3 (required): The 1-row table contianing @LastActualUpdate.
-    //
-    // So, check the number of results. If it's 3, then we should update our
-    // cache. Or, if it's 2, then there was no change and we can continue with
-    // the existing cached value.
-    if (results.length === 3) {
-      const [rawRows, __, varRows] = results;
-      const rows = rawRows.map(normalizeRow);
+    const entities = rows.map(normalizeRow);
 
-      lastKnownUpdate = varRows[0]["@LastActualUpdate"];
-
-      // We build lastResult into a Map up-front, to speed up the many lookups
-      // that the GQL resolvers will do as we group this data into GQL nodes!
-      lastResult = new Map();
-      for (const { colorId, itemId, ...row } of rows) {
-        if (!lastResult.has(colorId)) {
-          lastResult.set(colorId, new Map());
-        }
-        lastResult.get(colorId).set(itemId, row);
+    const result = new Map();
+    for (const { colorId, itemId, ...entity } of entities) {
+      if (!result.has(colorId)) {
+        result.set(colorId, new Map());
       }
+      result.get(colorId).set(itemId, entity);
     }
 
-    return [lastResult];
+    return [result];
   });
 
 const buildItemBodiesWithAppearanceDataLoader = (db) =>
diff --git a/src/server/types/Item.js b/src/server/types/Item.js
index 4a67461..8f55824 100644
--- a/src/server/types/Item.js
+++ b/src/server/types/Item.js
@@ -192,7 +192,7 @@ const typeDefs = gql`
     # NOTE: Most color IDs won't be accepted here. Either pass the ID of a
     #       major special color like Baby (#6), or leave it blank for standard
     #       bodies like Blue, Green, Red, etc.
-    itemsThatNeedModels(colorId: ID): [Item!]!
+    itemsThatNeedModels(colorId: ID): [Item!]! @cacheControl(maxAge: 1, staleWhileRevalidate: ${oneHour})
   }
 
   extend type Mutation {