From 0e8e50b05452a95b6bc9d4a43c92cf6d138aaac6 Mon Sep 17 00:00:00 2001 From: Matchu Date: Thu, 18 Mar 2021 12:57:29 -0700 Subject: [PATCH] Simpler, faster modeling query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I narrowed down the problem to the fact that we were joining in pet types against assets, and *then* running GROUP and DISTINCT and everything. Assets x compatible species/color pairs is a LOT of rows! Here, we instead get all the relevant body IDs first, and *then* match them against pet types—which we fetch in one batch to match body to canonical species/color. I'm also trashing the weird caching mechanism we did here, because in practice it doesn't seem reliable anyway. If anything, I'd want to look at stronger CDN caching. (I made a small improvement to the caching annotation, but ultimately it still doesn't matter, because this query uses logged-in stuff and always comes out max-age=0 anyway.) --- scripts/setup-mysql.sql | 69 ------------------------------- src/server/loaders.js | 89 +++++++++++++++++++++++----------------- src/server/types/Item.js | 2 +- 3 files changed, 53 insertions(+), 107 deletions(-) diff --git a/scripts/setup-mysql.sql b/scripts/setup-mysql.sql index ff19a19..8d550c9 100644 --- a/scripts/setup-mysql.sql +++ b/scripts/setup-mysql.sql @@ -35,72 +35,3 @@ GRANT SELECT, UPDATE ON openneo_id.users TO impress2020; -- mysqldump GRANT LOCK TABLES ON * TO impress2020; - --- Procedures used in the application - -DELIMITER $$ - -DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsV2$$ -CREATE PROCEDURE GetItemsThatNeedModelsV2() -BEGIN - SELECT pet_types.color_id AS color_id, items.id AS item_id, - GROUP_CONCAT(DISTINCT pet_types.species_id ORDER BY pet_types.species_id) - AS modeled_species_ids, - -- Vandagyre was added on 2014-11-14, so we add some buffer here. - -- TODO: Some later Dyeworks items don't support Vandagyre. - -- Add a manual db flag? - items.created_at >= "2014-12-01" AS supports_vandagyre - FROM items - INNER JOIN parents_swf_assets psa - ON psa.parent_type = "Item" AND psa.parent_id = items.id - INNER JOIN swf_assets - ON swf_assets.id = psa.swf_asset_id - INNER JOIN pet_types - ON pet_types.body_id = swf_assets.body_id - WHERE - pet_types.color_id IN ( - 8, -- Blue (Standard) - 6, -- Baby - 44, -- Maraquan - 46 -- Mutant - ) - AND items.modeling_status_hint IS NULL - GROUP BY color_id, item_id - HAVING - NOT ( - -- No species (either an All Bodies item, or a Capsule type thing) - count(DISTINCT pet_types.species_id) = 0 - -- Single species (probably just their item) - OR count(DISTINCT pet_types.species_id) = 1 - -- All species modeled - OR count(DISTINCT pet_types.species_id) = 55 - -- All species modeled except Vandagyre, for items that don't support it - OR (NOT supports_vandagyre AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54") - ) - ORDER BY color_id, item_id; -END$$ -GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsV2 TO impress2020$$ - --- This procedure is a performance optimization! We want the page to always be --- up-to-date, but we want to avoid re-running this query if modeling data --- hasn't changed. So, we use the last contribution timestamp as a cache hint --- for whether the data has expired. But in this environment, sequential --- queries are a bottleneck, so I bundled up that logic into this single --- procedure that can run on the database! That way, it's just one network --- round-trip instead of two, which makes a noticeable difference in our stack. -DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsIfNotCachedV2$$ -CREATE PROCEDURE GetItemsThatNeedModelsIfNotCachedV2( - IN last_known_update TIMESTAMP, - OUT last_actual_update TIMESTAMP -) -BEGIN - SET last_actual_update = - (SELECT created_at FROM contributions ORDER BY id DESC LIMIT 1); - - IF last_known_update < last_actual_update THEN - CALL GetItemsThatNeedModelsV2(); - END IF; -END$$ -GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsIfNotCachedV2 TO impress2020$$ - -DELIMITER ; diff --git a/src/server/loaders.js b/src/server/loaders.js index ce788dd..f017eef 100644 --- a/src/server/loaders.js +++ b/src/server/loaders.js @@ -410,8 +410,6 @@ const buildNewestItemsLoader = (db, loaders) => return [entities]; }); -let lastKnownUpdate = "1970-01-01"; // start it out very old! -let lastResult = new Map(); const buildItemsThatNeedModelsLoader = (db) => new DataLoader(async (keys) => { // Essentially, I want to take easy advantage of DataLoader's caching, for @@ -421,48 +419,65 @@ const buildItemsThatNeedModelsLoader = (db) => throw new Error(`this loader can only be loaded with the key "all"`); } - // Call the query as a procedure, defined in `setup-mysql.sql`. It will - // only run the query if modeling data has been changed since the timestamp - // we provide; otherwise, it skips the query and returns no rows, which is - // much faster! (The query takes a few seconds to run.) - // - // NOTE: This query has the colors hardcoded, we always fetch all of them! - // And then we look up the specific colors - const [results, _] = await db.query( + const [rows, _] = await db.query( + ` + SELECT T_ITEMS.item_id, + T_BODIES.color_id, + T_ITEMS.supports_vandagyre, + COUNT(*) AS modeled_species_count, + GROUP_CONCAT( + T_BODIES.species_id + ORDER BY T_BODIES.species_id + ) AS modeled_species_ids + FROM ( + -- NOTE: I found that extracting this as a separate query that runs + -- first made things WAAAY faster. Less to join/group, I guess? + SELECT DISTINCT items.id AS item_id, + swf_assets.body_id AS body_id, + -- Vandagyre was added on 2014-11-14, so we add some buffer here. + -- TODO: Some later Dyeworks items don't support Vandagyre. + -- Add a manual db flag? + items.created_at >= "2014-12-01" AS supports_vandagyre + FROM items + INNER JOIN parents_swf_assets psa ON psa.parent_type = "Item" + AND psa.parent_id = items.id + INNER JOIN swf_assets ON swf_assets.id = psa.swf_asset_id + INNER JOIN item_translations it ON it.item_id = items.id AND it.locale = "en" + WHERE items.modeling_status_hint IS NULL AND it.name NOT LIKE "%MME%" + ORDER BY item_id + ) T_ITEMS + INNER JOIN ( + SELECT DISTINCT body_id, species_id, color_id + FROM pet_types + WHERE color_id IN (6, 8, 44, 46) + ORDER BY body_id, species_id + ) T_BODIES ON T_ITEMS.body_id = T_BODIES.body_id + GROUP BY T_ITEMS.item_id, T_BODIES.color_id + HAVING NOT ( + -- No species (either an All Bodies item, or a Capsule type thing) + modeled_species_count = 0 + -- Single species (probably just their item) + OR modeled_species_count = 1 + -- All species modeled + OR modeled_species_count = 55 + -- All species modeled except Vandagyre, for items that don't support it + OR (NOT T_ITEMS.supports_vandagyre AND modeled_species_count = 54 AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54") + ) + ORDER BY T_ITEMS.item_id; ` - CALL GetItemsThatNeedModelsIfNotCachedV2(?, @LastActualUpdate); - SELECT @LastActualUpdate; - `, - [lastKnownUpdate] ); - // The query will return 2 or 3 results. - // Result 1 (optional): The rows produced by the CALL, if it ran the query. - // Or, if it skipped the query, this is omitted. - // Result 2 (required): The MySQL summary of the effects of the CALL. - // Result 3 (required): The 1-row table contianing @LastActualUpdate. - // - // So, check the number of results. If it's 3, then we should update our - // cache. Or, if it's 2, then there was no change and we can continue with - // the existing cached value. - if (results.length === 3) { - const [rawRows, __, varRows] = results; - const rows = rawRows.map(normalizeRow); + const entities = rows.map(normalizeRow); - lastKnownUpdate = varRows[0]["@LastActualUpdate"]; - - // We build lastResult into a Map up-front, to speed up the many lookups - // that the GQL resolvers will do as we group this data into GQL nodes! - lastResult = new Map(); - for (const { colorId, itemId, ...row } of rows) { - if (!lastResult.has(colorId)) { - lastResult.set(colorId, new Map()); - } - lastResult.get(colorId).set(itemId, row); + const result = new Map(); + for (const { colorId, itemId, ...entity } of entities) { + if (!result.has(colorId)) { + result.set(colorId, new Map()); } + result.get(colorId).set(itemId, entity); } - return [lastResult]; + return [result]; }); const buildItemBodiesWithAppearanceDataLoader = (db) => diff --git a/src/server/types/Item.js b/src/server/types/Item.js index 4a67461..8f55824 100644 --- a/src/server/types/Item.js +++ b/src/server/types/Item.js @@ -192,7 +192,7 @@ const typeDefs = gql` # NOTE: Most color IDs won't be accepted here. Either pass the ID of a # major special color like Baby (#6), or leave it blank for standard # bodies like Blue, Green, Red, etc. - itemsThatNeedModels(colorId: ID): [Item!]! + itemsThatNeedModels(colorId: ID): [Item!]! @cacheControl(maxAge: 1, staleWhileRevalidate: ${oneHour}) } extend type Mutation {